OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 #include "vm/regexp.h" | |
6 | |
7 #include "vm/dart_entry.h" | |
8 #include "vm/regexp_assembler.h" | |
9 #include "vm/regexp_ast.h" | |
10 #include "vm/unibrow-inl.h" | |
11 #include "vm/unicode.h" | |
12 #include "vm/symbols.h" | |
13 | |
14 #define I isolate() | |
15 #define CI compiler->isolate() | |
16 | |
17 namespace dart { | |
18 | |
19 DECLARE_FLAG(bool, trace_irregexp); | |
20 | |
21 #define DEFINE_ACCEPT(Type) \ | |
22 void Type##Node::Accept(NodeVisitor* visitor) { \ | |
23 visitor->Visit##Type(this); \ | |
24 } | |
25 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) | |
26 #undef DEFINE_ACCEPT | |
27 | |
28 | |
29 // Default to generating optimized regexp code. | |
30 static const bool kRegexpOptimization = true; | |
31 | |
32 // More makes code generation slower, less makes V8 benchmark score lower. | |
33 static const intptr_t kMaxLookaheadForBoyerMoore = 8; | |
34 | |
35 // In a 3-character pattern you can maximally step forwards 3 characters | |
36 // at a time, which is not always enough to pay for the extra logic. | |
37 static const intptr_t kPatternTooShortForBoyerMoore = 2; | |
38 | |
39 // The '2' variant has inclusive from and exclusive to. | |
40 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, | |
41 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. | |
42 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, | |
43 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, | |
44 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, | |
45 0xFEFF, 0xFF00, 0x10000 }; | |
46 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); | |
47 static const intptr_t kWordRanges[] = { | |
48 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; | |
49 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges); | |
50 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 }; | |
51 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges); | |
52 static const intptr_t kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; | |
53 static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); | |
54 static const intptr_t kLineTerminatorRanges[] = { | |
55 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 }; | |
56 static const intptr_t kLineTerminatorRangeCount = | |
57 ARRAY_SIZE(kLineTerminatorRanges); | |
58 | |
59 | |
60 static inline void PrintUtf16(uint16_t c) { | |
61 const char* format = (0x20 <= c && c <= 0x7F) ? | |
62 "%c" : (c <= 0xff) ? "\\x%02x" : "\\u%04x"; | |
63 OS::Print(format, c); | |
64 } | |
65 | |
66 | |
67 // We need to check for the following characters: 0x39c 0x3bc 0x178. | |
68 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { | |
69 // TODO(dcarney): this could be a lot more efficient. | |
70 return range.Contains(0x39c) || | |
71 range.Contains(0x3bc) || range.Contains(0x178); | |
72 } | |
73 | |
74 | |
75 static bool RangesContainLatin1Equivalents( | |
76 ZoneGrowableArray<CharacterRange>* ranges) { | |
77 for (intptr_t i = 0; i < ranges->length(); i++) { | |
78 // TODO(dcarney): this could be a lot more efficient. | |
79 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true; | |
80 } | |
81 return false; | |
82 } | |
83 | |
84 static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) { | |
85 ASSERT(c > Symbols::kMaxOneCharCodeSymbol); | |
86 switch (c) { | |
87 // This are equivalent characters in unicode. | |
88 case 0x39c: | |
89 case 0x3bc: | |
90 return 0xb5; | |
91 // This is an uppercase of a Latin-1 character | |
92 // outside of Latin-1. | |
93 case 0x178: | |
94 return 0xff; | |
95 } | |
96 return 0; | |
97 } | |
98 | |
99 | |
100 class VisitMarker : public ValueObject { | |
101 public: | |
102 explicit VisitMarker(NodeInfo* info) : info_(info) { | |
103 ASSERT(!info->visited); | |
104 info->visited = true; | |
105 } | |
106 ~VisitMarker() { | |
107 info_->visited = false; | |
108 } | |
109 private: | |
110 NodeInfo* info_; | |
111 }; | |
112 | |
113 | |
114 class FrequencyCollator : public ValueObject { | |
115 public: | |
116 FrequencyCollator() : total_samples_(0) { | |
117 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
118 frequencies_[i] = CharacterFrequency(i); | |
119 } | |
120 } | |
121 | |
122 void CountCharacter(intptr_t character) { | |
123 intptr_t index = (character & RegExpMacroAssembler::kTableMask); | |
124 frequencies_[index].Increment(); | |
125 total_samples_++; | |
126 } | |
127 | |
128 // Does not measure in percent, but rather per-128 (the table size from the | |
129 // regexp macro assembler). | |
130 intptr_t Frequency(intptr_t in_character) { | |
131 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); | |
132 if (total_samples_ < 1) return 1; // Division by zero. | |
133 intptr_t freq_in_per128 = | |
134 (frequencies_[in_character].counter() * 128) / total_samples_; | |
135 return freq_in_per128; | |
136 } | |
137 | |
138 private: | |
139 class CharacterFrequency { | |
140 public: | |
141 CharacterFrequency() : counter_(0), character_(-1) { } | |
142 explicit CharacterFrequency(intptr_t character) | |
143 : counter_(0), character_(character) { } | |
144 | |
145 void Increment() { counter_++; } | |
146 intptr_t counter() { return counter_; } | |
147 intptr_t character() { return character_; } | |
148 | |
149 private: | |
150 intptr_t counter_; | |
151 intptr_t character_; | |
152 | |
153 DISALLOW_ALLOCATION(); | |
154 }; | |
155 | |
156 | |
157 private: | |
158 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | |
159 intptr_t total_samples_; | |
160 }; | |
161 | |
162 | |
163 class RegExpCompiler : public ValueObject { | |
164 public: | |
165 RegExpCompiler(intptr_t capture_count, | |
166 bool ignore_case, | |
167 intptr_t specialization_cid); | |
168 | |
169 intptr_t AllocateRegister() { | |
170 return next_register_++; | |
171 } | |
172 | |
173 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler, | |
174 RegExpNode* start, | |
175 intptr_t capture_count, | |
176 const String& pattern); | |
177 | |
178 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } | |
179 | |
180 static const intptr_t kImplementationOffset = 0; | |
181 static const intptr_t kNumberOfRegistersOffset = 0; | |
182 static const intptr_t kCodeOffset = 1; | |
183 | |
184 IRRegExpMacroAssembler* macro_assembler() { return macro_assembler_; } | |
185 EndNode* accept() { return accept_; } | |
186 | |
187 static const intptr_t kMaxRecursion = 100; | |
188 inline intptr_t recursion_depth() { return recursion_depth_; } | |
189 inline void IncrementRecursionDepth() { recursion_depth_++; } | |
190 inline void DecrementRecursionDepth() { recursion_depth_--; } | |
191 | |
192 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | |
193 | |
194 inline bool ignore_case() { return ignore_case_; } | |
195 inline bool ascii() const { | |
196 return (specialization_cid_ == kOneByteStringCid || | |
197 specialization_cid_ == kExternalOneByteStringCid); | |
198 } | |
199 inline intptr_t specialization_cid() { return specialization_cid_; } | |
200 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | |
201 | |
202 intptr_t current_expansion_factor() { return current_expansion_factor_; } | |
203 void set_current_expansion_factor(intptr_t value) { | |
204 current_expansion_factor_ = value; | |
205 } | |
206 | |
207 Isolate* isolate() const { return isolate_; } | |
208 | |
209 static const intptr_t kNoRegister = -1; | |
210 | |
211 private: | |
212 EndNode* accept_; | |
213 intptr_t next_register_; | |
214 ZoneGrowableArray<RegExpNode*>* work_list_; | |
215 intptr_t recursion_depth_; | |
216 IRRegExpMacroAssembler* macro_assembler_; | |
217 bool ignore_case_; | |
218 intptr_t specialization_cid_; | |
219 bool reg_exp_too_big_; | |
220 intptr_t current_expansion_factor_; | |
221 FrequencyCollator frequency_collator_; | |
222 Isolate* isolate_; | |
223 }; | |
224 | |
225 | |
226 class RecursionCheck : public ValueObject { | |
227 public: | |
228 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | |
229 compiler->IncrementRecursionDepth(); | |
230 } | |
231 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | |
232 private: | |
233 RegExpCompiler* compiler_; | |
234 }; | |
235 | |
236 | |
237 // Scoped object to keep track of how much we unroll quantifier loops in the | |
238 // regexp graph generator. | |
239 class RegExpExpansionLimiter : public ValueObject { | |
240 public: | |
241 static const intptr_t kMaxExpansionFactor = 6; | |
242 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor) | |
243 : compiler_(compiler), | |
244 saved_expansion_factor_(compiler->current_expansion_factor()), | |
245 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { | |
246 ASSERT(factor > 0); | |
247 if (ok_to_expand_) { | |
248 if (factor > kMaxExpansionFactor) { | |
249 // Avoid integer overflow of the current expansion factor. | |
250 ok_to_expand_ = false; | |
251 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); | |
252 } else { | |
253 intptr_t new_factor = saved_expansion_factor_ * factor; | |
254 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); | |
255 compiler->set_current_expansion_factor(new_factor); | |
256 } | |
257 } | |
258 } | |
259 | |
260 ~RegExpExpansionLimiter() { | |
261 compiler_->set_current_expansion_factor(saved_expansion_factor_); | |
262 } | |
263 | |
264 bool ok_to_expand() { return ok_to_expand_; } | |
265 | |
266 private: | |
267 RegExpCompiler* compiler_; | |
268 intptr_t saved_expansion_factor_; | |
269 bool ok_to_expand_; | |
270 | |
271 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); | |
272 }; | |
273 | |
274 | |
275 // Node generation ------------------------------------------------------------- | |
276 | |
277 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, | |
278 RegExpNode* on_success) { | |
279 ZoneGrowableArray<TextElement>* elms = | |
280 new(CI) ZoneGrowableArray<TextElement>(1); | |
281 elms->Add(TextElement::Atom(this)); | |
282 return new(CI) TextNode(elms, on_success); | |
283 } | |
284 | |
285 | |
286 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, | |
287 RegExpNode* on_success) { | |
288 ZoneGrowableArray<TextElement>* elms = | |
289 new(CI) ZoneGrowableArray<TextElement>(1); | |
290 for (intptr_t i = 0; i < elements()->length(); i++) { | |
291 elms->Add(elements()->At(i)); | |
292 } | |
293 return new(CI) TextNode(elms, on_success); | |
294 } | |
295 | |
296 | |
297 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, | |
298 RegExpNode* on_success) { | |
299 return new(CI) TextNode(this, on_success); | |
300 } | |
301 | |
302 | |
303 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, | |
304 RegExpNode* on_success) { | |
305 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives(); | |
306 intptr_t length = alternatives->length(); | |
307 ChoiceNode* result = | |
308 new(CI) ChoiceNode(length, CI); | |
309 for (intptr_t i = 0; i < length; i++) { | |
310 GuardedAlternative alternative(alternatives->At(i)->ToNode(compiler, | |
311 on_success)); | |
312 result->AddAlternative(alternative); | |
313 } | |
314 return result; | |
315 } | |
316 | |
317 | |
318 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, | |
319 RegExpNode* on_success) { | |
320 return ToNode(min(), | |
321 max(), | |
322 is_greedy(), | |
323 body(), | |
324 compiler, | |
325 on_success); | |
326 } | |
327 | |
328 | |
329 RegExpNode* RegExpQuantifier::ToNode(intptr_t min, | |
330 intptr_t max, | |
331 bool is_greedy, | |
332 RegExpTree* body, | |
333 RegExpCompiler* compiler, | |
334 RegExpNode* on_success, | |
335 bool not_at_start) { | |
336 // x{f, t} becomes this: | |
337 // | |
338 // (r++)<-. | |
339 // | ` | |
340 // | (x) | |
341 // v ^ | |
342 // (r=0)-->(?)---/ [if r < t] | |
343 // | | |
344 // [if r >= f] \----> ... | |
345 // | |
346 | |
347 // 15.10.2.5 RepeatMatcher algorithm. | |
348 // The parser has already eliminated the case where max is 0. In the case | |
349 // where max_match is zero the parser has removed the quantifier if min was | |
350 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. | |
351 | |
352 // If we know that we cannot match zero length then things are a little | |
353 // simpler since we don't need to make the special zero length match check | |
354 // from step 2.1. If the min and max are small we can unroll a little in | |
355 // this case. | |
356 // Unroll (foo)+ and (foo){3,} | |
357 static const intptr_t kMaxUnrolledMinMatches = 3; | |
358 // Unroll (foo)? and (foo){x,3} | |
359 static const intptr_t kMaxUnrolledMaxMatches = 3; | |
360 if (max == 0) return on_success; // This can happen due to recursion. | |
361 bool body_can_be_empty = (body->min_match() == 0); | |
362 intptr_t body_start_reg = RegExpCompiler::kNoRegister; | |
363 Interval capture_registers = body->CaptureRegisters(); | |
364 bool needs_capture_clearing = !capture_registers.is_empty(); | |
365 Isolate* isolate = compiler->isolate(); | |
366 | |
367 if (body_can_be_empty) { | |
368 body_start_reg = compiler->AllocateRegister(); | |
369 } else if (kRegexpOptimization && !needs_capture_clearing) { | |
370 // Only unroll if there are no captures and the body can't be | |
371 // empty. | |
372 { | |
373 RegExpExpansionLimiter limiter( | |
374 compiler, min + ((max != min) ? 1 : 0)); | |
375 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { | |
376 intptr_t new_max = (max == kInfinity) ? max : max - min; | |
377 // Recurse once to get the loop or optional matches after the fixed | |
378 // ones. | |
379 RegExpNode* answer = ToNode( | |
380 0, new_max, is_greedy, body, compiler, on_success, true); | |
381 // Unroll the forced matches from 0 to min. This can cause chains of | |
382 // TextNodes (which the parser does not generate). These should be | |
383 // combined if it turns out they hinder good code generation. | |
384 for (intptr_t i = 0; i < min; i++) { | |
385 answer = body->ToNode(compiler, answer); | |
386 } | |
387 return answer; | |
388 } | |
389 } | |
390 if (max <= kMaxUnrolledMaxMatches && min == 0) { | |
391 ASSERT(max > 0); // Due to the 'if' above. | |
392 RegExpExpansionLimiter limiter(compiler, max); | |
393 if (limiter.ok_to_expand()) { | |
394 // Unroll the optional matches up to max. | |
395 RegExpNode* answer = on_success; | |
396 for (intptr_t i = 0; i < max; i++) { | |
397 ChoiceNode* alternation = new(isolate) ChoiceNode(2, isolate); | |
398 if (is_greedy) { | |
399 alternation->AddAlternative( | |
400 GuardedAlternative(body->ToNode(compiler, answer))); | |
401 alternation->AddAlternative(GuardedAlternative(on_success)); | |
402 } else { | |
403 alternation->AddAlternative(GuardedAlternative(on_success)); | |
404 alternation->AddAlternative( | |
405 GuardedAlternative(body->ToNode(compiler, answer))); | |
406 } | |
407 answer = alternation; | |
408 if (not_at_start) alternation->set_not_at_start(); | |
409 } | |
410 return answer; | |
411 } | |
412 } | |
413 } | |
414 bool has_min = min > 0; | |
415 bool has_max = max < RegExpTree::kInfinity; | |
416 bool needs_counter = has_min || has_max; | |
417 intptr_t reg_ctr = needs_counter | |
418 ? compiler->AllocateRegister() | |
419 : RegExpCompiler::kNoRegister; | |
420 LoopChoiceNode* center = new(isolate) LoopChoiceNode(body->min_match() == 0, | |
421 isolate); | |
422 if (not_at_start) center->set_not_at_start(); | |
423 RegExpNode* loop_return = needs_counter | |
424 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) | |
425 : static_cast<RegExpNode*>(center); | |
426 if (body_can_be_empty) { | |
427 // If the body can be empty we need to check if it was and then | |
428 // backtrack. | |
429 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, | |
430 reg_ctr, | |
431 min, | |
432 loop_return); | |
433 } | |
434 RegExpNode* body_node = body->ToNode(compiler, loop_return); | |
435 if (body_can_be_empty) { | |
436 // If the body can be empty we need to store the start position | |
437 // so we can bail out if it was empty. | |
438 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); | |
439 } | |
440 if (needs_capture_clearing) { | |
441 // Before entering the body of this loop we need to clear captures. | |
442 body_node = ActionNode::ClearCaptures(capture_registers, body_node); | |
443 } | |
444 GuardedAlternative body_alt(body_node); | |
445 if (has_max) { | |
446 Guard* body_guard = | |
447 new(isolate) Guard(reg_ctr, Guard::LT, max); | |
448 body_alt.AddGuard(body_guard, isolate); | |
449 } | |
450 GuardedAlternative rest_alt(on_success); | |
451 if (has_min) { | |
452 Guard* rest_guard = new(isolate) Guard(reg_ctr, Guard::GEQ, min); | |
453 rest_alt.AddGuard(rest_guard, isolate); | |
454 } | |
455 if (is_greedy) { | |
456 center->AddLoopAlternative(body_alt); | |
457 center->AddContinueAlternative(rest_alt); | |
458 } else { | |
459 center->AddContinueAlternative(rest_alt); | |
460 center->AddLoopAlternative(body_alt); | |
461 } | |
462 if (needs_counter) { | |
463 return ActionNode::SetRegister(reg_ctr, 0, center); | |
464 } else { | |
465 return center; | |
466 } | |
467 } | |
468 | |
469 | |
470 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, | |
471 RegExpNode* on_success) { | |
472 switch (assertion_type()) { | |
473 case START_OF_LINE: | |
474 return AssertionNode::AfterNewline(on_success); | |
475 case START_OF_INPUT: | |
476 return AssertionNode::AtStart(on_success); | |
477 case BOUNDARY: | |
478 return AssertionNode::AtBoundary(on_success); | |
479 case NON_BOUNDARY: | |
480 return AssertionNode::AtNonBoundary(on_success); | |
481 case END_OF_INPUT: | |
482 return AssertionNode::AtEnd(on_success); | |
483 case END_OF_LINE: { | |
484 // Compile $ in multiline regexps as an alternation with a positive | |
485 // lookahead in one side and an end-of-input on the other side. | |
486 // We need two registers for the lookahead. | |
487 intptr_t stack_pointer_register = compiler->AllocateRegister(); | |
488 intptr_t position_register = compiler->AllocateRegister(); | |
489 // The ChoiceNode to distinguish between a newline and end-of-input. | |
490 ChoiceNode* result = new ChoiceNode(2, on_success->isolate()); | |
491 // Create a newline atom. | |
492 ZoneGrowableArray<CharacterRange>* newline_ranges = | |
493 new ZoneGrowableArray<CharacterRange>(3); | |
494 CharacterRange::AddClassEscape('n', newline_ranges); | |
495 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n'); | |
496 TextNode* newline_matcher = new TextNode( | |
497 newline_atom, | |
498 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, | |
499 position_register, | |
500 0, // No captures inside. | |
501 -1, // Ignored if no captures. | |
502 on_success)); | |
503 // Create an end-of-input matcher. | |
504 RegExpNode* end_of_line = ActionNode::BeginSubmatch( | |
505 stack_pointer_register, | |
506 position_register, | |
507 newline_matcher); | |
508 // Add the two alternatives to the ChoiceNode. | |
509 GuardedAlternative eol_alternative(end_of_line); | |
510 result->AddAlternative(eol_alternative); | |
511 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); | |
512 result->AddAlternative(end_alternative); | |
513 return result; | |
514 } | |
515 default: | |
516 UNREACHABLE(); | |
517 } | |
518 return on_success; | |
519 } | |
520 | |
521 | |
522 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, | |
523 RegExpNode* on_success) { | |
524 return new(CI) | |
525 BackReferenceNode(RegExpCapture::StartRegister(index()), | |
526 RegExpCapture::EndRegister(index()), | |
527 on_success); | |
528 } | |
529 | |
530 | |
531 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, | |
532 RegExpNode* on_success) { | |
533 return on_success; | |
534 } | |
535 | |
536 | |
537 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, | |
538 RegExpNode* on_success) { | |
539 intptr_t stack_pointer_register = compiler->AllocateRegister(); | |
540 intptr_t position_register = compiler->AllocateRegister(); | |
541 | |
542 const intptr_t registers_per_capture = 2; | |
543 const intptr_t register_of_first_capture = 2; | |
544 intptr_t register_count = capture_count_ * registers_per_capture; | |
545 intptr_t register_start = | |
546 register_of_first_capture + capture_from_ * registers_per_capture; | |
547 | |
548 RegExpNode* success; | |
549 if (is_positive()) { | |
550 RegExpNode* node = ActionNode::BeginSubmatch( | |
551 stack_pointer_register, | |
552 position_register, | |
553 body()->ToNode( | |
554 compiler, | |
555 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, | |
556 position_register, | |
557 register_count, | |
558 register_start, | |
559 on_success))); | |
560 return node; | |
561 } else { | |
562 // We use a ChoiceNode for a negative lookahead because it has most of | |
563 // the characteristics we need. It has the body of the lookahead as its | |
564 // first alternative and the expression after the lookahead of the second | |
565 // alternative. If the first alternative succeeds then the | |
566 // NegativeSubmatchSuccess will unwind the stack including everything the | |
567 // choice node set up and backtrack. If the first alternative fails then | |
568 // the second alternative is tried, which is exactly the desired result | |
569 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special | |
570 // ChoiceNode that knows to ignore the first exit when calculating quick | |
571 // checks. | |
572 | |
573 GuardedAlternative body_alt( | |
574 body()->ToNode( | |
575 compiler, | |
576 success = new(CI) NegativeSubmatchSuccess(stack_pointer_register, | |
577 position_register, | |
578 register_count, | |
579 register_start, | |
580 CI))); | |
581 ChoiceNode* choice_node = | |
582 new(CI) NegativeLookaheadChoiceNode(body_alt, | |
583 GuardedAlternative(on_success), | |
584 CI); | |
585 return ActionNode::BeginSubmatch(stack_pointer_register, | |
586 position_register, | |
587 choice_node); | |
588 } | |
589 } | |
590 | |
591 | |
592 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, | |
593 RegExpNode* on_success) { | |
594 return ToNode(body(), index(), compiler, on_success); | |
595 } | |
596 | |
597 | |
598 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, | |
599 intptr_t index, | |
600 RegExpCompiler* compiler, | |
601 RegExpNode* on_success) { | |
602 intptr_t start_reg = RegExpCapture::StartRegister(index); | |
603 intptr_t end_reg = RegExpCapture::EndRegister(index); | |
604 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); | |
605 RegExpNode* body_node = body->ToNode(compiler, store_end); | |
606 return ActionNode::StorePosition(start_reg, true, body_node); | |
607 } | |
608 | |
609 | |
610 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, | |
611 RegExpNode* on_success) { | |
612 ZoneGrowableArray<RegExpTree*>* children = nodes(); | |
613 RegExpNode* current = on_success; | |
614 for (intptr_t i = children->length() - 1; i >= 0; i--) { | |
615 current = children->At(i)->ToNode(compiler, current); | |
616 } | |
617 return current; | |
618 } | |
619 | |
620 | |
621 // ASCII filtering ------------------------------------------------------------- | |
622 | |
623 | |
624 RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth, bool ignore_case) { | |
625 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); | |
626 if (next == NULL) return set_replacement(NULL); | |
627 on_success_ = next; | |
628 return set_replacement(this); | |
629 } | |
630 | |
631 | |
632 RegExpNode* SeqRegExpNode::FilterASCII(intptr_t depth, bool ignore_case) { | |
633 if (info()->replacement_calculated) return replacement(); | |
634 if (depth < 0) return this; | |
635 ASSERT(!info()->visited); | |
636 VisitMarker marker(info()); | |
637 return FilterSuccessor(depth - 1, ignore_case); | |
638 } | |
639 | |
640 | |
641 RegExpNode* TextNode::FilterASCII(intptr_t depth, bool ignore_case) { | |
642 if (info()->replacement_calculated) return replacement(); | |
643 if (depth < 0) return this; | |
644 ASSERT(!info()->visited); | |
645 VisitMarker marker(info()); | |
646 intptr_t element_count = elms_->length(); | |
647 for (intptr_t i = 0; i < element_count; i++) { | |
648 TextElement elm = elms_->At(i); | |
649 if (elm.text_type() == TextElement::ATOM) { | |
650 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data(); | |
651 for (intptr_t j = 0; j < quarks->length(); j++) { | |
652 uint16_t c = quarks->At(j); | |
653 if (c <= Symbols::kMaxOneCharCodeSymbol) continue; | |
654 if (!ignore_case) return set_replacement(NULL); | |
655 // Here, we need to check for characters whose upper and lower cases | |
656 // are outside the Latin-1 range. | |
657 uint16_t converted = ConvertNonLatin1ToLatin1(c); | |
658 // Character is outside Latin-1 completely | |
659 if (converted == 0) return set_replacement(NULL); | |
660 // Convert quark to Latin-1 in place. | |
661 (*quarks)[0] = converted; | |
662 } | |
663 } else { | |
664 ASSERT(elm.text_type() == TextElement::CHAR_CLASS); | |
665 RegExpCharacterClass* cc = elm.char_class(); | |
666 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges(); | |
667 if (!CharacterRange::IsCanonical(ranges)) { | |
668 CharacterRange::Canonicalize(ranges); | |
669 } | |
670 // Now they are in order so we only need to look at the first. | |
671 intptr_t range_count = ranges->length(); | |
672 if (cc->is_negated()) { | |
673 if (range_count != 0 && | |
674 ranges->At(0).from() == 0 && | |
675 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) { | |
676 // This will be handled in a later filter. | |
677 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; | |
678 return set_replacement(NULL); | |
679 } | |
680 } else { | |
681 if (range_count == 0 || | |
682 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) { | |
683 // This will be handled in a later filter. | |
684 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; | |
685 return set_replacement(NULL); | |
686 } | |
687 } | |
688 } | |
689 } | |
690 return FilterSuccessor(depth - 1, ignore_case); | |
691 } | |
692 | |
693 | |
694 RegExpNode* LoopChoiceNode::FilterASCII(intptr_t depth, bool ignore_case) { | |
695 if (info()->replacement_calculated) return replacement(); | |
696 if (depth < 0) return this; | |
697 if (info()->visited) return this; | |
698 { | |
699 VisitMarker marker(info()); | |
700 | |
701 RegExpNode* continue_replacement = | |
702 continue_node_->FilterASCII(depth - 1, ignore_case); | |
703 // If we can't continue after the loop then there is no sense in doing the | |
704 // loop. | |
705 if (continue_replacement == NULL) return set_replacement(NULL); | |
706 } | |
707 | |
708 return ChoiceNode::FilterASCII(depth - 1, ignore_case); | |
709 } | |
710 | |
711 | |
712 RegExpNode* ChoiceNode::FilterASCII(intptr_t depth, bool ignore_case) { | |
713 if (info()->replacement_calculated) return replacement(); | |
714 if (depth < 0) return this; | |
715 if (info()->visited) return this; | |
716 VisitMarker marker(info()); | |
717 intptr_t choice_count = alternatives_->length(); | |
718 | |
719 for (intptr_t i = 0; i < choice_count; i++) { | |
720 GuardedAlternative alternative = alternatives_->At(i); | |
721 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { | |
722 set_replacement(this); | |
723 return this; | |
724 } | |
725 } | |
726 | |
727 intptr_t surviving = 0; | |
728 RegExpNode* survivor = NULL; | |
729 for (intptr_t i = 0; i < choice_count; i++) { | |
730 GuardedAlternative alternative = alternatives_->At(i); | |
731 RegExpNode* replacement = | |
732 alternative.node()->FilterASCII(depth - 1, ignore_case); | |
733 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. | |
734 if (replacement != NULL) { | |
735 (*alternatives_)[i].set_node(replacement); | |
736 surviving++; | |
737 survivor = replacement; | |
738 } | |
739 } | |
740 if (surviving < 2) return set_replacement(survivor); | |
741 | |
742 set_replacement(this); | |
743 if (surviving == choice_count) { | |
744 return this; | |
745 } | |
746 // Only some of the nodes survived the filtering. We need to rebuild the | |
747 // alternatives list. | |
748 ZoneGrowableArray<GuardedAlternative>* new_alternatives = | |
749 new(I) ZoneGrowableArray<GuardedAlternative>(surviving); | |
750 for (intptr_t i = 0; i < choice_count; i++) { | |
751 RegExpNode* replacement = | |
752 (*alternatives_)[i].node()->FilterASCII(depth - 1, ignore_case); | |
753 if (replacement != NULL) { | |
754 (*alternatives_)[i].set_node(replacement); | |
755 new_alternatives->Add((*alternatives_)[i]); | |
756 } | |
757 } | |
758 alternatives_ = new_alternatives; | |
759 return this; | |
760 } | |
761 | |
762 | |
763 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(intptr_t depth, | |
764 bool ignore_case) { | |
765 if (info()->replacement_calculated) return replacement(); | |
766 if (depth < 0) return this; | |
767 if (info()->visited) return this; | |
768 VisitMarker marker(info()); | |
769 // Alternative 0 is the negative lookahead, alternative 1 is what comes | |
770 // afterwards. | |
771 RegExpNode* node = (*alternatives_)[1].node(); | |
772 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); | |
773 if (replacement == NULL) return set_replacement(NULL); | |
774 (*alternatives_)[1].set_node(replacement); | |
775 | |
776 RegExpNode* neg_node = (*alternatives_)[0].node(); | |
777 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); | |
778 // If the negative lookahead is always going to fail then | |
779 // we don't need to check it. | |
780 if (neg_replacement == NULL) return set_replacement(replacement); | |
781 (*alternatives_)[0].set_node(neg_replacement); | |
782 return set_replacement(this); | |
783 } | |
784 | |
785 | |
786 // Code emission --------------------------------------------------------------- | |
787 | |
788 | |
789 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, | |
790 Guard* guard, | |
791 Trace* trace) { | |
792 switch (guard->op()) { | |
793 case Guard::LT: | |
794 ASSERT(!trace->mentions_reg(guard->reg())); | |
795 macro_assembler->IfRegisterGE(guard->reg(), | |
796 guard->value(), | |
797 trace->backtrack()); | |
798 break; | |
799 case Guard::GEQ: | |
800 ASSERT(!trace->mentions_reg(guard->reg())); | |
801 macro_assembler->IfRegisterLT(guard->reg(), | |
802 guard->value(), | |
803 trace->backtrack()); | |
804 break; | |
805 } | |
806 } | |
807 | |
808 | |
809 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { | |
810 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
811 | |
812 // Omit flushing the trace. We discard the entire stack frame anyway. | |
813 | |
814 if (!label()->IsBound()) { | |
815 // We are completely independent of the trace, since we ignore it, | |
816 // so this code can be used as the generic version. | |
817 assembler->BindBlock(label()); | |
818 } | |
819 | |
820 // Throw away everything on the backtrack stack since the start | |
821 // of the negative submatch and restore the character position. | |
822 assembler->ReadCurrentPositionFromRegister(current_position_register_); | |
823 assembler->ReadStackPointerFromRegister(stack_pointer_register_); | |
824 if (clear_capture_count_ > 0) { | |
825 // Clear any captures that might have been performed during the success | |
826 // of the body of the negative look-ahead. | |
827 intptr_t clear_capture_end = | |
828 clear_capture_start_ + clear_capture_count_ - 1; | |
829 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); | |
830 } | |
831 // Now that we have unwound the stack we find at the top of the stack the | |
832 // backtrack that the BeginSubmatch node got. | |
833 assembler->Backtrack(); | |
834 } | |
835 | |
836 | |
837 bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) { | |
838 ASSERT(*cp_offset == 0); | |
839 for (DeferredAction* action = actions_; | |
840 action != NULL; | |
841 action = action->next()) { | |
842 if (action->Mentions(reg)) { | |
843 if (action->action_type() == ActionNode::STORE_POSITION) { | |
844 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); | |
845 return true; | |
846 } else { | |
847 return false; | |
848 } | |
849 } | |
850 } | |
851 return false; | |
852 } | |
853 | |
854 | |
855 // This is called as we come into a loop choice node and some other tricky | |
856 // nodes. It normalizes the state of the code generator to ensure we can | |
857 // generate generic code. | |
858 intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers, | |
859 Isolate* isolate) { | |
860 intptr_t max_register = RegExpCompiler::kNoRegister; | |
861 for (DeferredAction* action = actions_; | |
862 action != NULL; | |
863 action = action->next()) { | |
864 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { | |
865 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); | |
866 for (intptr_t i = range.from(); i <= range.to(); i++) | |
867 affected_registers->Set(i, isolate); | |
868 if (range.to() > max_register) max_register = range.to(); | |
869 } else { | |
870 affected_registers->Set(action->reg(), isolate); | |
871 if (action->reg() > max_register) max_register = action->reg(); | |
872 } | |
873 } | |
874 return max_register; | |
875 } | |
876 | |
877 | |
878 bool Trace::DeferredAction::Mentions(intptr_t that) { | |
879 if (action_type() == ActionNode::CLEAR_CAPTURES) { | |
880 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); | |
881 return range.Contains(that); | |
882 } else { | |
883 return reg() == that; | |
884 } | |
885 } | |
886 | |
887 | |
888 bool Trace::mentions_reg(intptr_t reg) { | |
889 for (DeferredAction* action = actions_; | |
890 action != NULL; | |
891 action = action->next()) { | |
892 if (action->Mentions(reg)) | |
893 return true; | |
894 } | |
895 return false; | |
896 } | |
897 | |
898 | |
899 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, | |
900 intptr_t max_register, | |
901 const OutSet& registers_to_pop, | |
902 const OutSet& registers_to_clear) { | |
903 for (intptr_t reg = max_register; reg >= 0; reg--) { | |
904 if (registers_to_pop.Get(reg)) { | |
905 assembler->PopRegister(reg); | |
906 } else if (registers_to_clear.Get(reg)) { | |
907 intptr_t clear_to = reg; | |
908 while (reg > 0 && registers_to_clear.Get(reg - 1)) { | |
909 reg--; | |
910 } | |
911 assembler->ClearRegisters(reg, clear_to); | |
912 } | |
913 } | |
914 } | |
915 | |
916 | |
917 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, | |
918 intptr_t max_register, | |
919 const OutSet& affected_registers, | |
920 OutSet* registers_to_pop, | |
921 OutSet* registers_to_clear, | |
922 Isolate* isolate) { | |
923 for (intptr_t reg = 0; reg <= max_register; reg++) { | |
924 if (!affected_registers.Get(reg)) { | |
925 continue; | |
926 } | |
927 | |
928 // The chronologically first deferred action in the trace | |
929 // is used to infer the action needed to restore a register | |
930 // to its previous state (or not, if it's safe to ignore it). | |
931 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; | |
932 DeferredActionUndoType undo_action = IGNORE; | |
933 | |
934 intptr_t value = 0; | |
935 bool absolute = false; | |
936 bool clear = false; | |
937 intptr_t store_position = -1; | |
938 // This is a little tricky because we are scanning the actions in reverse | |
939 // historical order (newest first). | |
940 for (DeferredAction* action = actions_; | |
941 action != NULL; | |
942 action = action->next()) { | |
943 if (action->Mentions(reg)) { | |
944 switch (action->action_type()) { | |
945 case ActionNode::SET_REGISTER: { | |
946 Trace::DeferredSetRegister* psr = | |
947 static_cast<Trace::DeferredSetRegister*>(action); | |
948 if (!absolute) { | |
949 value += psr->value(); | |
950 absolute = true; | |
951 } | |
952 // SET_REGISTER is currently only used for newly introduced loop | |
953 // counters. They can have a significant previous value if they | |
954 // occour in a loop. TODO(lrn): Propagate this information, so | |
955 // we can set undo_action to IGNORE if we know there is no value to | |
956 // restore. | |
957 undo_action = RESTORE; | |
958 ASSERT(store_position == -1); | |
959 ASSERT(!clear); | |
960 break; | |
961 } | |
962 case ActionNode::INCREMENT_REGISTER: | |
963 if (!absolute) { | |
964 value++; | |
965 } | |
966 ASSERT(store_position == -1); | |
967 ASSERT(!clear); | |
968 undo_action = RESTORE; | |
969 break; | |
970 case ActionNode::STORE_POSITION: { | |
971 Trace::DeferredCapture* pc = | |
972 static_cast<Trace::DeferredCapture*>(action); | |
973 if (!clear && store_position == -1) { | |
974 store_position = pc->cp_offset(); | |
975 } | |
976 | |
977 // For captures we know that stores and clears alternate. | |
978 // Other register, are never cleared, and if the occur | |
979 // inside a loop, they might be assigned more than once. | |
980 if (reg <= 1) { | |
981 // Registers zero and one, aka "capture zero", is | |
982 // always set correctly if we succeed. There is no | |
983 // need to undo a setting on backtrack, because we | |
984 // will set it again or fail. | |
985 undo_action = IGNORE; | |
986 } else { | |
987 undo_action = pc->is_capture() ? CLEAR : RESTORE; | |
988 } | |
989 ASSERT(!absolute); | |
990 ASSERT(value == 0); | |
991 break; | |
992 } | |
993 case ActionNode::CLEAR_CAPTURES: { | |
994 // Since we're scanning in reverse order, if we've already | |
995 // set the position we have to ignore historically earlier | |
996 // clearing operations. | |
997 if (store_position == -1) { | |
998 clear = true; | |
999 } | |
1000 undo_action = RESTORE; | |
1001 ASSERT(!absolute); | |
1002 ASSERT(value == 0); | |
1003 break; | |
1004 } | |
1005 default: | |
1006 UNREACHABLE(); | |
1007 break; | |
1008 } | |
1009 } | |
1010 } | |
1011 // Prepare for the undo-action (e.g., push if it's going to be popped). | |
1012 if (undo_action == RESTORE) { | |
1013 assembler->PushRegister(reg); | |
1014 registers_to_pop->Set(reg, isolate); | |
1015 } else if (undo_action == CLEAR) { | |
1016 registers_to_clear->Set(reg, isolate); | |
1017 } | |
1018 // Perform the chronologically last action (or accumulated increment) | |
1019 // for the register. | |
1020 if (store_position != -1) { | |
1021 assembler->WriteCurrentPositionToRegister(reg, store_position); | |
1022 } else if (clear) { | |
1023 assembler->ClearRegisters(reg, reg); | |
1024 } else if (absolute) { | |
1025 assembler->SetRegister(reg, value); | |
1026 } else if (value != 0) { | |
1027 assembler->AdvanceRegister(reg, value); | |
1028 } | |
1029 } | |
1030 } | |
1031 | |
1032 | |
1033 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { | |
1034 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1035 | |
1036 ASSERT(!is_trivial()); | |
1037 | |
1038 if (actions_ == NULL && backtrack() == NULL) { | |
1039 // Here we just have some deferred cp advances to fix and we are back to | |
1040 // a normal situation. We may also have to forget some information gained | |
1041 // through a quick check that was already performed. | |
1042 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); | |
1043 // Create a new trivial state and generate the node with that. | |
1044 Trace new_state; | |
1045 successor->Emit(compiler, &new_state); | |
1046 return; | |
1047 } | |
1048 | |
1049 // Generate deferred actions here along with code to undo them again. | |
1050 OutSet affected_registers; | |
1051 | |
1052 if (backtrack() != NULL) { | |
1053 // Here we have a concrete backtrack location. These are set up by choice | |
1054 // nodes and so they indicate that we have a deferred save of the current | |
1055 // position which we may need to emit here. | |
1056 assembler->PushCurrentPosition(); | |
1057 } | |
1058 | |
1059 intptr_t max_register = FindAffectedRegisters(&affected_registers, CI); | |
1060 OutSet registers_to_pop; | |
1061 OutSet registers_to_clear; | |
1062 PerformDeferredActions(assembler, | |
1063 max_register, | |
1064 affected_registers, | |
1065 ®isters_to_pop, | |
1066 ®isters_to_clear, | |
1067 CI); | |
1068 if (cp_offset_ != 0) { | |
1069 assembler->AdvanceCurrentPosition(cp_offset_); | |
1070 } | |
1071 | |
1072 // Create a new trivial state and generate the node with that. | |
1073 BlockLabel undo; | |
1074 assembler->PushBacktrack(&undo); | |
1075 Trace new_state; | |
1076 successor->Emit(compiler, &new_state); | |
1077 | |
1078 // On backtrack we need to restore state. | |
1079 assembler->BindBlock(&undo); | |
1080 RestoreAffectedRegisters(assembler, | |
1081 max_register, | |
1082 registers_to_pop, | |
1083 registers_to_clear); | |
1084 if (backtrack() == NULL) { | |
1085 assembler->Backtrack(); | |
1086 } else { | |
1087 assembler->PopCurrentPosition(); | |
1088 assembler->GoTo(backtrack()); | |
1089 } | |
1090 } | |
1091 | |
1092 | |
1093 void Trace::InvalidateCurrentCharacter() { | |
1094 characters_preloaded_ = 0; | |
1095 } | |
1096 | |
1097 | |
1098 void Trace::AdvanceCurrentPositionInTrace(intptr_t by, | |
1099 RegExpCompiler* compiler) { | |
1100 ASSERT(by > 0); | |
1101 // We don't have an instruction for shifting the current character register | |
1102 // down or for using a shifted value for anything so lets just forget that | |
1103 // we preloaded any characters into it. | |
1104 characters_preloaded_ = 0; | |
1105 // Adjust the offsets of the quick check performed information. This | |
1106 // information is used to find out what we already determined about the | |
1107 // characters by means of mask and compare. | |
1108 quick_check_performed_.Advance(by, compiler->ascii()); | |
1109 cp_offset_ += by; | |
1110 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { | |
1111 compiler->SetRegExpTooBig(); | |
1112 cp_offset_ = 0; | |
1113 } | |
1114 bound_checked_up_to_ = Utils::Maximum(static_cast<intptr_t>(0), | |
1115 bound_checked_up_to_ - by); | |
1116 } | |
1117 | |
1118 | |
1119 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1120 if (!trace->is_trivial()) { | |
1121 trace->Flush(compiler, this); | |
1122 return; | |
1123 } | |
1124 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1125 if (!label()->IsBound()) { | |
1126 assembler->BindBlock(label()); | |
1127 } | |
1128 switch (action_) { | |
1129 case ACCEPT: | |
1130 assembler->Succeed(); | |
1131 return; | |
1132 case BACKTRACK: | |
1133 assembler->GoTo(trace->backtrack()); | |
1134 return; | |
1135 case NEGATIVE_SUBMATCH_SUCCESS: | |
1136 // This case is handled in a different virtual method. | |
1137 UNREACHABLE(); | |
1138 } | |
1139 UNIMPLEMENTED(); | |
1140 } | |
1141 | |
1142 | |
1143 bool QuickCheckDetails::Rationalize(bool asc) { | |
1144 bool found_useful_op = false; | |
1145 uint32_t char_mask; | |
1146 if (asc) { | |
1147 char_mask = Symbols::kMaxOneCharCodeSymbol; | |
1148 } else { | |
1149 char_mask = Utf16::kMaxCodeUnit; | |
1150 } | |
1151 mask_ = 0; | |
1152 value_ = 0; | |
1153 intptr_t char_shift = 0; | |
1154 for (intptr_t i = 0; i < characters_; i++) { | |
1155 Position* pos = &positions_[i]; | |
1156 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) { | |
1157 found_useful_op = true; | |
1158 } | |
1159 mask_ |= (pos->mask & char_mask) << char_shift; | |
1160 value_ |= (pos->value & char_mask) << char_shift; | |
1161 char_shift += asc ? 8 : 16; | |
1162 } | |
1163 return found_useful_op; | |
1164 } | |
1165 | |
1166 | |
1167 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, | |
1168 Trace* trace, | |
1169 bool preload_has_checked_bounds, | |
1170 BlockLabel* on_possible_success, | |
1171 QuickCheckDetails* details, | |
1172 bool fall_through_on_failure) { | |
1173 if (details->characters() == 0) return false; | |
1174 GetQuickCheckDetails( | |
1175 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); | |
1176 if (details->cannot_match()) return false; | |
1177 if (!details->Rationalize(compiler->ascii())) return false; | |
1178 ASSERT(details->characters() == 1 || | |
1179 compiler->macro_assembler()->CanReadUnaligned()); | |
1180 uint32_t mask = details->mask(); | |
1181 uint32_t value = details->value(); | |
1182 | |
1183 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1184 | |
1185 if (trace->characters_preloaded() != details->characters()) { | |
1186 assembler->LoadCurrentCharacter(trace->cp_offset(), | |
1187 trace->backtrack(), | |
1188 !preload_has_checked_bounds, | |
1189 details->characters()); | |
1190 } | |
1191 | |
1192 | |
1193 bool need_mask = true; | |
1194 | |
1195 if (details->characters() == 1) { | |
1196 // If number of characters preloaded is 1 then we used a byte or 16 bit | |
1197 // load so the value is already masked down. | |
1198 uint32_t char_mask; | |
1199 if (compiler->ascii()) { | |
1200 char_mask = Symbols::kMaxOneCharCodeSymbol; | |
1201 } else { | |
1202 char_mask = Utf16::kMaxCodeUnit; | |
1203 } | |
1204 if ((mask & char_mask) == char_mask) need_mask = false; | |
1205 mask &= char_mask; | |
1206 } else { | |
1207 // For 2-character preloads in ASCII mode or 1-character preloads in | |
1208 // TWO_BYTE mode we also use a 16 bit load with zero extend. | |
1209 if (details->characters() == 2 && compiler->ascii()) { | |
1210 if ((mask & 0xffff) == 0xffff) need_mask = false; | |
1211 } else if (details->characters() == 1 && !compiler->ascii()) { | |
1212 if ((mask & 0xffff) == 0xffff) need_mask = false; | |
1213 } else { | |
1214 if (mask == 0xffffffff) need_mask = false; | |
1215 } | |
1216 } | |
1217 | |
1218 if (fall_through_on_failure) { | |
1219 if (need_mask) { | |
1220 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); | |
1221 } else { | |
1222 assembler->CheckCharacter(value, on_possible_success); | |
1223 } | |
1224 } else { | |
1225 if (need_mask) { | |
1226 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); | |
1227 } else { | |
1228 assembler->CheckNotCharacter(value, trace->backtrack()); | |
1229 } | |
1230 } | |
1231 return true; | |
1232 } | |
1233 | |
1234 | |
1235 // Emit the code to check for a ^ in multiline mode (1-character lookbehind | |
1236 // that matches newline or the start of input). | |
1237 static void EmitHat(RegExpCompiler* compiler, | |
1238 RegExpNode* on_success, | |
1239 Trace* trace) { | |
1240 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1241 // We will be loading the previous character into the current character | |
1242 // register. | |
1243 Trace new_trace(*trace); | |
1244 new_trace.InvalidateCurrentCharacter(); | |
1245 | |
1246 BlockLabel ok; | |
1247 if (new_trace.cp_offset() == 0) { | |
1248 // The start of input counts as a newline in this context, so skip to | |
1249 // ok if we are at the start. | |
1250 assembler->CheckAtStart(&ok); | |
1251 } | |
1252 // We already checked that we are not at the start of input so it must be | |
1253 // OK to load the previous character. | |
1254 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, | |
1255 new_trace.backtrack(), | |
1256 false); | |
1257 if (!assembler->CheckSpecialCharacterClass('n', | |
1258 new_trace.backtrack())) { | |
1259 // Newline means \n, \r, 0x2028 or 0x2029. | |
1260 if (!compiler->ascii()) { | |
1261 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); | |
1262 } | |
1263 assembler->CheckCharacter('\n', &ok); | |
1264 assembler->CheckNotCharacter('\r', new_trace.backtrack()); | |
1265 } | |
1266 assembler->BindBlock(&ok); | |
1267 on_success->Emit(compiler, &new_trace); | |
1268 } | |
1269 | |
1270 | |
1271 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1272 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1273 switch (assertion_type_) { | |
1274 case AT_END: { | |
1275 BlockLabel ok; | |
1276 assembler->CheckPosition(trace->cp_offset(), &ok); | |
1277 assembler->GoTo(trace->backtrack()); | |
1278 assembler->BindBlock(&ok); | |
1279 break; | |
1280 } | |
1281 case AT_START: { | |
1282 if (trace->at_start() == Trace::FALSE_VALUE) { | |
1283 assembler->GoTo(trace->backtrack()); | |
1284 return; | |
1285 } | |
1286 if (trace->at_start() == Trace::UNKNOWN) { | |
1287 assembler->CheckNotAtStart(trace->backtrack()); | |
1288 Trace at_start_trace = *trace; | |
1289 at_start_trace.set_at_start(true); | |
1290 on_success()->Emit(compiler, &at_start_trace); | |
1291 return; | |
1292 } | |
1293 } | |
1294 break; | |
1295 case AFTER_NEWLINE: | |
1296 EmitHat(compiler, on_success(), trace); | |
1297 return; | |
1298 case AT_BOUNDARY: | |
1299 case AT_NON_BOUNDARY: { | |
1300 EmitBoundaryCheck(compiler, trace); | |
1301 return; | |
1302 } | |
1303 } | |
1304 on_success()->Emit(compiler, trace); | |
1305 } | |
1306 | |
1307 | |
1308 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, | |
1309 Trace* trace) { | |
1310 // If we are generating a greedy loop then don't stop and don't reuse code. | |
1311 if (trace->stop_node() != NULL) { | |
1312 return CONTINUE; | |
1313 } | |
1314 | |
1315 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | |
1316 if (trace->is_trivial()) { | |
1317 if (label_.IsBound()) { | |
1318 // We are being asked to generate a generic version, but that's already | |
1319 // been done so just go to it. | |
1320 macro_assembler->GoTo(&label_); | |
1321 return DONE; | |
1322 } | |
1323 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { | |
1324 // To avoid too deep recursion we push the node to the work queue and just | |
1325 // generate a goto here. | |
1326 compiler->AddWork(this); | |
1327 macro_assembler->GoTo(&label_); | |
1328 return DONE; | |
1329 } | |
1330 // Generate generic version of the node and bind the label for later use. | |
1331 macro_assembler->BindBlock(&label_); | |
1332 return CONTINUE; | |
1333 } | |
1334 | |
1335 // We are being asked to make a non-generic version. Keep track of how many | |
1336 // non-generic versions we generate so as not to overdo it. | |
1337 trace_count_++; | |
1338 if (kRegexpOptimization && | |
1339 trace_count_ < kMaxCopiesCodeGenerated && | |
1340 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { | |
1341 return CONTINUE; | |
1342 } | |
1343 | |
1344 // If we get here code has been generated for this node too many times or | |
1345 // recursion is too deep. Time to switch to a generic version. The code for | |
1346 // generic versions above can handle deep recursion properly. | |
1347 trace->Flush(compiler, this); | |
1348 return DONE; | |
1349 } | |
1350 | |
1351 | |
1352 // This generates the code to match a text node. A text node can contain | |
1353 // straight character sequences (possibly to be matched in a case-independent | |
1354 // way) and character classes. For efficiency we do not do this in a single | |
1355 // pass from left to right. Instead we pass over the text node several times, | |
1356 // emitting code for some character positions every time. See the comment on | |
1357 // TextEmitPass for details. | |
1358 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1359 LimitResult limit_result = LimitVersions(compiler, trace); | |
1360 if (limit_result == DONE) return; | |
1361 ASSERT(limit_result == CONTINUE); | |
1362 | |
1363 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { | |
1364 compiler->SetRegExpTooBig(); | |
1365 return; | |
1366 } | |
1367 | |
1368 if (compiler->ascii()) { | |
1369 intptr_t dummy = 0; | |
1370 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); | |
1371 } | |
1372 | |
1373 bool first_elt_done = false; | |
1374 intptr_t bound_checked_to = trace->cp_offset() - 1; | |
1375 bound_checked_to += trace->bound_checked_up_to(); | |
1376 | |
1377 // If a character is preloaded into the current character register then | |
1378 // check that now. | |
1379 if (trace->characters_preloaded() == 1) { | |
1380 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) { | |
1381 if (!SkipPass(pass, compiler->ignore_case())) { | |
1382 TextEmitPass(compiler, | |
1383 static_cast<TextEmitPassType>(pass), | |
1384 true, | |
1385 trace, | |
1386 false, | |
1387 &bound_checked_to); | |
1388 } | |
1389 } | |
1390 first_elt_done = true; | |
1391 } | |
1392 | |
1393 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) { | |
1394 if (!SkipPass(pass, compiler->ignore_case())) { | |
1395 TextEmitPass(compiler, | |
1396 static_cast<TextEmitPassType>(pass), | |
1397 false, | |
1398 trace, | |
1399 first_elt_done, | |
1400 &bound_checked_to); | |
1401 } | |
1402 } | |
1403 | |
1404 Trace successor_trace(*trace); | |
1405 successor_trace.set_at_start(false); | |
1406 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); | |
1407 RecursionCheck rc(compiler); | |
1408 on_success()->Emit(compiler, &successor_trace); | |
1409 } | |
1410 | |
1411 | |
1412 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1413 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | |
1414 if (trace->stop_node() == this) { | |
1415 intptr_t text_length = | |
1416 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0])); | |
1417 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); | |
1418 // Update the counter-based backtracking info on the stack. This is an | |
1419 // optimization for greedy loops (see below). | |
1420 ASSERT(trace->cp_offset() == text_length); | |
1421 macro_assembler->AdvanceCurrentPosition(text_length); | |
1422 macro_assembler->GoTo(trace->loop_label()); | |
1423 return; | |
1424 } | |
1425 ASSERT(trace->stop_node() == NULL); | |
1426 if (!trace->is_trivial()) { | |
1427 trace->Flush(compiler, this); | |
1428 return; | |
1429 } | |
1430 ChoiceNode::Emit(compiler, trace); | |
1431 } | |
1432 | |
1433 | |
1434 // This class is used when generating the alternatives in a choice node. It | |
1435 // records the way the alternative is being code generated. | |
1436 struct AlternativeGeneration { | |
1437 AlternativeGeneration() | |
1438 : possible_success(), | |
1439 expects_preload(false), | |
1440 after(), | |
1441 quick_check_details() { } | |
1442 BlockLabel possible_success; | |
1443 bool expects_preload; | |
1444 BlockLabel after; | |
1445 QuickCheckDetails quick_check_details; | |
1446 }; | |
1447 | |
1448 | |
1449 // Creates a list of AlternativeGenerations. If the list has a reasonable | |
1450 // size then it is on the stack, otherwise the excess is on the heap. | |
1451 class AlternativeGenerationList { | |
1452 public: | |
1453 explicit AlternativeGenerationList(intptr_t count) | |
1454 : alt_gens_(count) { | |
1455 for (intptr_t i = 0; i < count && i < kAFew; i++) { | |
1456 alt_gens_.Add(a_few_alt_gens_ + i); | |
1457 } | |
1458 for (intptr_t i = kAFew; i < count; i++) { | |
1459 alt_gens_.Add(new AlternativeGeneration()); | |
1460 } | |
1461 } | |
1462 ~AlternativeGenerationList() { | |
1463 for (intptr_t i = kAFew; i < alt_gens_.length(); i++) { | |
1464 delete alt_gens_[i]; | |
1465 alt_gens_[i] = NULL; | |
1466 } | |
1467 } | |
1468 | |
1469 AlternativeGeneration* at(intptr_t i) { | |
1470 return alt_gens_[i]; | |
1471 } | |
1472 | |
1473 private: | |
1474 static const intptr_t kAFew = 10; | |
1475 GrowableArray<AlternativeGeneration*> alt_gens_; | |
1476 AlternativeGeneration a_few_alt_gens_[kAFew]; | |
1477 | |
1478 DISALLOW_ALLOCATION(); | |
1479 }; | |
1480 | |
1481 | |
1482 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1483 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | |
1484 intptr_t choice_count = alternatives_->length(); | |
1485 | |
1486 #ifdef DEBUG | |
1487 for (intptr_t i = 0; i < choice_count - 1; i++) { | |
1488 GuardedAlternative alternative = alternatives_->At(i); | |
1489 ZoneGrowableArray<Guard*>* guards = alternative.guards(); | |
1490 intptr_t guard_count = (guards == NULL) ? 0 : guards->length(); | |
1491 for (intptr_t j = 0; j < guard_count; j++) { | |
1492 ASSERT(!trace->mentions_reg(guards->At(j)->reg())); | |
1493 } | |
1494 } | |
1495 #endif | |
1496 | |
1497 LimitResult limit_result = LimitVersions(compiler, trace); | |
1498 if (limit_result == DONE) return; | |
1499 ASSERT(limit_result == CONTINUE); | |
1500 | |
1501 intptr_t new_flush_budget = trace->flush_budget() / choice_count; | |
1502 if (trace->flush_budget() == 0 && trace->actions() != NULL) { | |
1503 trace->Flush(compiler, this); | |
1504 return; | |
1505 } | |
1506 | |
1507 RecursionCheck rc(compiler); | |
1508 | |
1509 Trace* current_trace = trace; | |
1510 | |
1511 intptr_t text_length = | |
1512 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0])); | |
1513 bool greedy_loop = false; | |
1514 BlockLabel greedy_loop_label; | |
1515 Trace counter_backtrack_trace; | |
1516 counter_backtrack_trace.set_backtrack(&greedy_loop_label); | |
1517 if (not_at_start()) counter_backtrack_trace.set_at_start(false); | |
1518 | |
1519 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { | |
1520 // Here we have special handling for greedy loops containing only text nodes | |
1521 // and other simple nodes. These are handled by pushing the current | |
1522 // position on the stack and then incrementing the current position each | |
1523 // time around the switch. On backtrack we decrement the current position | |
1524 // and check it against the pushed value. This avoids pushing backtrack | |
1525 // information for each iteration of the loop, which could take up a lot of | |
1526 // space. | |
1527 greedy_loop = true; | |
1528 ASSERT(trace->stop_node() == NULL); | |
1529 macro_assembler->PushCurrentPosition(); | |
1530 current_trace = &counter_backtrack_trace; | |
1531 BlockLabel greedy_match_failed; | |
1532 Trace greedy_match_trace; | |
1533 if (not_at_start()) greedy_match_trace.set_at_start(false); | |
1534 greedy_match_trace.set_backtrack(&greedy_match_failed); | |
1535 BlockLabel loop_label; | |
1536 macro_assembler->BindBlock(&loop_label); | |
1537 greedy_match_trace.set_stop_node(this); | |
1538 greedy_match_trace.set_loop_label(&loop_label); | |
1539 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace); | |
1540 macro_assembler->BindBlock(&greedy_match_failed); | |
1541 } | |
1542 | |
1543 BlockLabel second_choice; // For use in greedy matches. | |
1544 macro_assembler->BindBlock(&second_choice); | |
1545 | |
1546 intptr_t first_normal_choice = greedy_loop ? 1 : 0; | |
1547 | |
1548 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE; | |
1549 const intptr_t kEatsAtLeastNotYetInitialized = -1; | |
1550 intptr_t eats_at_least = kEatsAtLeastNotYetInitialized; | |
1551 | |
1552 bool skip_was_emitted = false; | |
1553 | |
1554 if (!greedy_loop && choice_count == 2) { | |
1555 GuardedAlternative alt1 = (*alternatives_)[1]; | |
1556 if (alt1.guards() == NULL || alt1.guards()->length() == 0) { | |
1557 RegExpNode* eats_anything_node = alt1.node(); | |
1558 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | |
1559 this) { | |
1560 // At this point we know that we are at a non-greedy loop that will eat | |
1561 // any character one at a time. Any non-anchored regexp has such a | |
1562 // loop prepended to it in order to find where it starts. We look for | |
1563 // a pattern of the form ...abc... where we can look 6 characters ahead | |
1564 // and step forwards 3 if the character is not one of abc. Abc need | |
1565 // not be atoms, they can be any reasonably limited character class or | |
1566 // small alternation. | |
1567 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | |
1568 BoyerMooreLookahead* lookahead = bm_info(not_at_start); | |
1569 if (lookahead == NULL) { | |
1570 eats_at_least = Utils::Minimum(kMaxLookaheadForBoyerMoore, | |
1571 EatsAtLeast(kMaxLookaheadForBoyerMoore, | |
1572 kRecursionBudget, | |
1573 not_at_start)); | |
1574 if (eats_at_least >= 1) { | |
1575 BoyerMooreLookahead* bm = | |
1576 new(I) BoyerMooreLookahead(eats_at_least, compiler, I); | |
1577 GuardedAlternative alt0 = alternatives_->At(0); | |
1578 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start); | |
1579 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); | |
1580 } | |
1581 } else { | |
1582 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); | |
1583 } | |
1584 } | |
1585 } | |
1586 } | |
1587 | |
1588 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | |
1589 // Save some time by looking at most one machine word ahead. | |
1590 eats_at_least = | |
1591 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start); | |
1592 } | |
1593 intptr_t preload_characters = | |
1594 CalculatePreloadCharacters(compiler, eats_at_least); | |
1595 | |
1596 bool preload_is_current = !skip_was_emitted && | |
1597 (current_trace->characters_preloaded() == preload_characters); | |
1598 bool preload_has_checked_bounds = preload_is_current; | |
1599 | |
1600 AlternativeGenerationList alt_gens(choice_count); | |
1601 | |
1602 // For now we just call all choices one after the other. The idea ultimately | |
1603 // is to use the Dispatch table to try only the relevant ones. | |
1604 for (intptr_t i = first_normal_choice; i < choice_count; i++) { | |
1605 GuardedAlternative alternative = alternatives_->At(i); | |
1606 AlternativeGeneration* alt_gen = alt_gens.at(i); | |
1607 alt_gen->quick_check_details.set_characters(preload_characters); | |
1608 ZoneGrowableArray<Guard*>* guards = alternative.guards(); | |
1609 intptr_t guard_count = (guards == NULL) ? 0 : guards->length(); | |
1610 Trace new_trace(*current_trace); | |
1611 new_trace.set_characters_preloaded(preload_is_current ? | |
1612 preload_characters : | |
1613 0); | |
1614 if (preload_has_checked_bounds) { | |
1615 new_trace.set_bound_checked_up_to(preload_characters); | |
1616 } | |
1617 new_trace.quick_check_performed()->Clear(); | |
1618 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); | |
1619 alt_gen->expects_preload = preload_is_current; | |
1620 bool generate_full_check_inline = false; | |
1621 if (kRegexpOptimization && | |
1622 try_to_emit_quick_check_for_alternative(i) && | |
1623 alternative.node()->EmitQuickCheck(compiler, | |
1624 &new_trace, | |
1625 preload_has_checked_bounds, | |
1626 &alt_gen->possible_success, | |
1627 &alt_gen->quick_check_details, | |
1628 i < choice_count - 1)) { | |
1629 // Quick check was generated for this choice. | |
1630 preload_is_current = true; | |
1631 preload_has_checked_bounds = true; | |
1632 // On the last choice in the ChoiceNode we generated the quick | |
1633 // check to fall through on possible success. So now we need to | |
1634 // generate the full check inline. | |
1635 if (i == choice_count - 1) { | |
1636 macro_assembler->BindBlock(&alt_gen->possible_success); | |
1637 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); | |
1638 new_trace.set_characters_preloaded(preload_characters); | |
1639 new_trace.set_bound_checked_up_to(preload_characters); | |
1640 generate_full_check_inline = true; | |
1641 } | |
1642 } else if (alt_gen->quick_check_details.cannot_match()) { | |
1643 if (i == choice_count - 1 && !greedy_loop) { | |
1644 macro_assembler->GoTo(trace->backtrack()); | |
1645 } | |
1646 continue; | |
1647 } else { | |
1648 // No quick check was generated. Put the full code here. | |
1649 // If this is not the first choice then there could be slow checks from | |
1650 // previous cases that go here when they fail. There's no reason to | |
1651 // insist that they preload characters since the slow check we are about | |
1652 // to generate probably can't use it. | |
1653 if (i != first_normal_choice) { | |
1654 alt_gen->expects_preload = false; | |
1655 new_trace.InvalidateCurrentCharacter(); | |
1656 } | |
1657 if (i < choice_count - 1) { | |
1658 new_trace.set_backtrack(&alt_gen->after); | |
1659 } | |
1660 generate_full_check_inline = true; | |
1661 } | |
1662 if (generate_full_check_inline) { | |
1663 if (new_trace.actions() != NULL) { | |
1664 new_trace.set_flush_budget(new_flush_budget); | |
1665 } | |
1666 for (intptr_t j = 0; j < guard_count; j++) { | |
1667 GenerateGuard(macro_assembler, guards->At(j), &new_trace); | |
1668 } | |
1669 alternative.node()->Emit(compiler, &new_trace); | |
1670 preload_is_current = false; | |
1671 } | |
1672 macro_assembler->BindBlock(&alt_gen->after); | |
1673 } | |
1674 if (greedy_loop) { | |
1675 macro_assembler->BindBlock(&greedy_loop_label); | |
1676 // If we have unwound to the bottom then backtrack. | |
1677 macro_assembler->CheckGreedyLoop(trace->backtrack()); | |
1678 // Otherwise try the second priority at an earlier position. | |
1679 macro_assembler->AdvanceCurrentPosition(-text_length); | |
1680 macro_assembler->GoTo(&second_choice); | |
1681 } | |
1682 | |
1683 // At this point we need to generate slow checks for the alternatives where | |
1684 // the quick check was inlined. We can recognize these because the associated | |
1685 // label was bound. | |
1686 for (intptr_t i = first_normal_choice; i < choice_count - 1; i++) { | |
1687 AlternativeGeneration* alt_gen = alt_gens.at(i); | |
1688 Trace new_trace(*current_trace); | |
1689 // If there are actions to be flushed we have to limit how many times | |
1690 // they are flushed. Take the budget of the parent trace and distribute | |
1691 // it fairly amongst the children. | |
1692 if (new_trace.actions() != NULL) { | |
1693 new_trace.set_flush_budget(new_flush_budget); | |
1694 } | |
1695 EmitOutOfLineContinuation(compiler, | |
1696 &new_trace, | |
1697 alternatives_->At(i), | |
1698 alt_gen, | |
1699 preload_characters, | |
1700 alt_gens.at(i + 1)->expects_preload); | |
1701 } | |
1702 } | |
1703 | |
1704 | |
1705 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, | |
1706 Trace* trace, | |
1707 GuardedAlternative alternative, | |
1708 AlternativeGeneration* alt_gen, | |
1709 intptr_t preload_characters, | |
1710 bool next_expects_preload) { | |
1711 if (!alt_gen->possible_success.IsLinked()) return; | |
1712 | |
1713 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | |
1714 macro_assembler->BindBlock(&alt_gen->possible_success); | |
1715 Trace out_of_line_trace(*trace); | |
1716 out_of_line_trace.set_characters_preloaded(preload_characters); | |
1717 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); | |
1718 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); | |
1719 ZoneGrowableArray<Guard*>* guards = alternative.guards(); | |
1720 intptr_t guard_count = (guards == NULL) ? 0 : guards->length(); | |
1721 if (next_expects_preload) { | |
1722 BlockLabel reload_current_char; | |
1723 out_of_line_trace.set_backtrack(&reload_current_char); | |
1724 for (intptr_t j = 0; j < guard_count; j++) { | |
1725 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace); | |
1726 } | |
1727 alternative.node()->Emit(compiler, &out_of_line_trace); | |
1728 macro_assembler->BindBlock(&reload_current_char); | |
1729 // Reload the current character, since the next quick check expects that. | |
1730 // We don't need to check bounds here because we only get into this | |
1731 // code through a quick check which already did the checked load. | |
1732 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), | |
1733 NULL, | |
1734 false, | |
1735 preload_characters); | |
1736 macro_assembler->GoTo(&(alt_gen->after)); | |
1737 } else { | |
1738 out_of_line_trace.set_backtrack(&(alt_gen->after)); | |
1739 for (intptr_t j = 0; j < guard_count; j++) { | |
1740 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace); | |
1741 } | |
1742 alternative.node()->Emit(compiler, &out_of_line_trace); | |
1743 } | |
1744 } | |
1745 | |
1746 | |
1747 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1748 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1749 LimitResult limit_result = LimitVersions(compiler, trace); | |
1750 if (limit_result == DONE) return; | |
1751 ASSERT(limit_result == CONTINUE); | |
1752 | |
1753 RecursionCheck rc(compiler); | |
1754 | |
1755 switch (action_type_) { | |
1756 case STORE_POSITION: { | |
1757 Trace::DeferredCapture | |
1758 new_capture(data_.u_position_register.reg, | |
1759 data_.u_position_register.is_capture, | |
1760 trace); | |
1761 Trace new_trace = *trace; | |
1762 new_trace.add_action(&new_capture); | |
1763 on_success()->Emit(compiler, &new_trace); | |
1764 break; | |
1765 } | |
1766 case INCREMENT_REGISTER: { | |
1767 Trace::DeferredIncrementRegister | |
1768 new_increment(data_.u_increment_register.reg); | |
1769 Trace new_trace = *trace; | |
1770 new_trace.add_action(&new_increment); | |
1771 on_success()->Emit(compiler, &new_trace); | |
1772 break; | |
1773 } | |
1774 case SET_REGISTER: { | |
1775 Trace::DeferredSetRegister | |
1776 new_set(data_.u_store_register.reg, data_.u_store_register.value); | |
1777 Trace new_trace = *trace; | |
1778 new_trace.add_action(&new_set); | |
1779 on_success()->Emit(compiler, &new_trace); | |
1780 break; | |
1781 } | |
1782 case CLEAR_CAPTURES: { | |
1783 Trace::DeferredClearCaptures | |
1784 new_capture(Interval(data_.u_clear_captures.range_from, | |
1785 data_.u_clear_captures.range_to)); | |
1786 Trace new_trace = *trace; | |
1787 new_trace.add_action(&new_capture); | |
1788 on_success()->Emit(compiler, &new_trace); | |
1789 break; | |
1790 } | |
1791 case BEGIN_SUBMATCH: | |
1792 if (!trace->is_trivial()) { | |
1793 trace->Flush(compiler, this); | |
1794 } else { | |
1795 assembler->WriteCurrentPositionToRegister( | |
1796 data_.u_submatch.current_position_register, 0); | |
1797 assembler->WriteStackPointerToRegister( | |
1798 data_.u_submatch.stack_pointer_register); | |
1799 on_success()->Emit(compiler, trace); | |
1800 } | |
1801 break; | |
1802 case EMPTY_MATCH_CHECK: { | |
1803 intptr_t start_pos_reg = data_.u_empty_match_check.start_register; | |
1804 intptr_t stored_pos = 0; | |
1805 intptr_t rep_reg = data_.u_empty_match_check.repetition_register; | |
1806 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); | |
1807 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); | |
1808 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { | |
1809 // If we know we haven't advanced and there is no minimum we | |
1810 // can just backtrack immediately. | |
1811 assembler->GoTo(trace->backtrack()); | |
1812 } else if (know_dist && stored_pos < trace->cp_offset()) { | |
1813 // If we know we've advanced we can generate the continuation | |
1814 // immediately. | |
1815 on_success()->Emit(compiler, trace); | |
1816 } else if (!trace->is_trivial()) { | |
1817 trace->Flush(compiler, this); | |
1818 } else { | |
1819 BlockLabel skip_empty_check; | |
1820 // If we have a minimum number of repetitions we check the current | |
1821 // number first and skip the empty check if it's not enough. | |
1822 if (has_minimum) { | |
1823 intptr_t limit = data_.u_empty_match_check.repetition_limit; | |
1824 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); | |
1825 } | |
1826 // If the match is empty we bail out, otherwise we fall through | |
1827 // to the on-success continuation. | |
1828 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, | |
1829 trace->backtrack()); | |
1830 assembler->BindBlock(&skip_empty_check); | |
1831 on_success()->Emit(compiler, trace); | |
1832 } | |
1833 break; | |
1834 } | |
1835 case POSITIVE_SUBMATCH_SUCCESS: { | |
1836 if (!trace->is_trivial()) { | |
1837 trace->Flush(compiler, this); | |
1838 return; | |
1839 } | |
1840 assembler->ReadCurrentPositionFromRegister( | |
1841 data_.u_submatch.current_position_register); | |
1842 assembler->ReadStackPointerFromRegister( | |
1843 data_.u_submatch.stack_pointer_register); | |
1844 intptr_t clear_register_count = data_.u_submatch.clear_register_count; | |
1845 if (clear_register_count == 0) { | |
1846 on_success()->Emit(compiler, trace); | |
1847 return; | |
1848 } | |
1849 intptr_t clear_registers_from = data_.u_submatch.clear_register_from; | |
1850 BlockLabel clear_registers_backtrack; | |
1851 Trace new_trace = *trace; | |
1852 new_trace.set_backtrack(&clear_registers_backtrack); | |
1853 on_success()->Emit(compiler, &new_trace); | |
1854 | |
1855 assembler->BindBlock(&clear_registers_backtrack); | |
1856 intptr_t clear_registers_to = | |
1857 clear_registers_from + clear_register_count - 1; | |
1858 assembler->ClearRegisters(clear_registers_from, clear_registers_to); | |
1859 | |
1860 ASSERT(trace->backtrack() == NULL); | |
1861 assembler->Backtrack(); | |
1862 return; | |
1863 } | |
1864 default: | |
1865 UNREACHABLE(); | |
1866 } | |
1867 } | |
1868 | |
1869 | |
1870 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { | |
1871 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
1872 if (!trace->is_trivial()) { | |
1873 trace->Flush(compiler, this); | |
1874 return; | |
1875 } | |
1876 | |
1877 LimitResult limit_result = LimitVersions(compiler, trace); | |
1878 if (limit_result == DONE) return; | |
1879 ASSERT(limit_result == CONTINUE); | |
1880 | |
1881 RecursionCheck rc(compiler); | |
1882 | |
1883 ASSERT(start_reg_ + 1 == end_reg_); | |
1884 if (compiler->ignore_case()) { | |
1885 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, | |
1886 trace->backtrack()); | |
1887 } else { | |
1888 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); | |
1889 } | |
1890 on_success()->Emit(compiler, trace); | |
1891 } | |
1892 | |
1893 | |
1894 void ActionNode::FillInBMInfo(intptr_t offset, | |
1895 intptr_t budget, | |
1896 BoyerMooreLookahead* bm, | |
1897 bool not_at_start) { | |
1898 if (action_type_ == BEGIN_SUBMATCH) { | |
1899 bm->SetRest(offset); | |
1900 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { | |
1901 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); | |
1902 } | |
1903 SaveBMInfo(bm, not_at_start, offset); | |
1904 } | |
1905 | |
1906 | |
1907 void AssertionNode::FillInBMInfo(intptr_t offset, | |
1908 intptr_t budget, | |
1909 BoyerMooreLookahead* bm, | |
1910 bool not_at_start) { | |
1911 // Match the behaviour of EatsAtLeast on this node. | |
1912 if (assertion_type() == AT_START && not_at_start) return; | |
1913 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); | |
1914 SaveBMInfo(bm, not_at_start, offset); | |
1915 } | |
1916 | |
1917 | |
1918 void BackReferenceNode::FillInBMInfo(intptr_t offset, | |
1919 intptr_t budget, | |
1920 BoyerMooreLookahead* bm, | |
1921 bool not_at_start) { | |
1922 // Working out the set of characters that a backreference can match is too | |
1923 // hard, so we just say that any character can match. | |
1924 bm->SetRest(offset); | |
1925 SaveBMInfo(bm, not_at_start, offset); | |
1926 } | |
1927 | |
1928 | |
1929 // Returns the number of characters in the equivalence class, omitting those | |
1930 // that cannot occur in the source string because it is ASCII. | |
1931 static intptr_t GetCaseIndependentLetters(uint16_t character, | |
1932 bool ascii_subject, | |
1933 int32_t* letters) { | |
1934 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize; | |
1935 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters); | |
1936 // Unibrow returns 0 or 1 for characters where case independence is | |
1937 // trivial. | |
1938 if (length == 0) { | |
1939 letters[0] = character; | |
1940 length = 1; | |
1941 } | |
1942 if (!ascii_subject || character <= Symbols::kMaxOneCharCodeSymbol) { | |
1943 return length; | |
1944 } | |
1945 // The standard requires that non-ASCII characters cannot have ASCII | |
1946 // character codes in their equivalence class. | |
1947 return 0; | |
1948 } | |
1949 | |
1950 | |
1951 void ChoiceNode::FillInBMInfo(intptr_t offset, | |
1952 intptr_t budget, | |
1953 BoyerMooreLookahead* bm, | |
1954 bool not_at_start) { | |
1955 ZoneGrowableArray<GuardedAlternative>* alts = alternatives(); | |
1956 budget = (budget - 1) / alts->length(); | |
1957 for (intptr_t i = 0; i < alts->length(); i++) { | |
1958 GuardedAlternative& alt = (*alts)[i]; | |
1959 if (alt.guards() != NULL && alt.guards()->length() != 0) { | |
1960 bm->SetRest(offset); // Give up trying to fill in info. | |
1961 SaveBMInfo(bm, not_at_start, offset); | |
1962 return; | |
1963 } | |
1964 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start); | |
1965 } | |
1966 SaveBMInfo(bm, not_at_start, offset); | |
1967 } | |
1968 | |
1969 | |
1970 void EndNode::FillInBMInfo(intptr_t offset, | |
1971 intptr_t budget, | |
1972 BoyerMooreLookahead* bm, | |
1973 bool not_at_start) { | |
1974 // Returning 0 from EatsAtLeast should ensure we never get here. | |
1975 UNREACHABLE(); | |
1976 } | |
1977 | |
1978 | |
1979 void LoopChoiceNode::FillInBMInfo(intptr_t offset, | |
1980 intptr_t budget, | |
1981 BoyerMooreLookahead* bm, | |
1982 bool not_at_start) { | |
1983 if (body_can_be_zero_length_ || budget <= 0) { | |
1984 bm->SetRest(offset); | |
1985 SaveBMInfo(bm, not_at_start, offset); | |
1986 return; | |
1987 } | |
1988 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start); | |
1989 SaveBMInfo(bm, not_at_start, offset); | |
1990 } | |
1991 | |
1992 | |
1993 void TextNode::FillInBMInfo(intptr_t initial_offset, | |
1994 intptr_t budget, | |
1995 BoyerMooreLookahead* bm, | |
1996 bool not_at_start) { | |
1997 if (initial_offset >= bm->length()) return; | |
1998 intptr_t offset = initial_offset; | |
1999 intptr_t max_char = bm->max_char(); | |
2000 for (intptr_t i = 0; i < elements()->length(); i++) { | |
2001 if (offset >= bm->length()) { | |
2002 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
2003 return; | |
2004 } | |
2005 TextElement text = elements()->At(i); | |
2006 if (text.text_type() == TextElement::ATOM) { | |
2007 RegExpAtom* atom = text.atom(); | |
2008 for (intptr_t j = 0; j < atom->length(); j++, offset++) { | |
2009 if (offset >= bm->length()) { | |
2010 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
2011 return; | |
2012 } | |
2013 uint16_t character = atom->data()->At(j); | |
2014 if (bm->compiler()->ignore_case()) { | |
2015 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
2016 intptr_t length = GetCaseIndependentLetters( | |
2017 character, | |
2018 bm->max_char() == Symbols::kMaxOneCharCodeSymbol, | |
2019 chars); | |
2020 for (intptr_t j = 0; j < length; j++) { | |
2021 bm->Set(offset, chars[j]); | |
2022 } | |
2023 } else { | |
2024 if (character <= max_char) bm->Set(offset, character); | |
2025 } | |
2026 } | |
2027 } else { | |
2028 ASSERT(text.text_type() == TextElement::CHAR_CLASS); | |
2029 RegExpCharacterClass* char_class = text.char_class(); | |
2030 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges(); | |
2031 if (char_class->is_negated()) { | |
2032 bm->SetAll(offset); | |
2033 } else { | |
2034 for (intptr_t k = 0; k < ranges->length(); k++) { | |
2035 CharacterRange& range = (*ranges)[k]; | |
2036 if (range.from() > max_char) continue; | |
2037 intptr_t to = Utils::Minimum(max_char, | |
2038 static_cast<intptr_t>(range.to())); | |
2039 bm->SetInterval(offset, Interval(range.from(), to)); | |
2040 } | |
2041 } | |
2042 offset++; | |
2043 } | |
2044 } | |
2045 if (offset >= bm->length()) { | |
2046 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
2047 return; | |
2048 } | |
2049 on_success()->FillInBMInfo(offset, | |
2050 budget - 1, | |
2051 bm, | |
2052 true); // Not at start after a text node. | |
2053 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
2054 } | |
2055 | |
2056 | |
2057 // Check for [0-9A-Z_a-z]. | |
2058 static void EmitWordCheck(RegExpMacroAssembler* assembler, | |
2059 BlockLabel* word, | |
2060 BlockLabel* non_word, | |
2061 bool fall_through_on_word) { | |
2062 if (assembler->CheckSpecialCharacterClass( | |
2063 fall_through_on_word ? 'w' : 'W', | |
2064 fall_through_on_word ? non_word : word)) { | |
2065 // Optimized implementation available. | |
2066 return; | |
2067 } | |
2068 assembler->CheckCharacterGT('z', non_word); | |
2069 assembler->CheckCharacterLT('0', non_word); | |
2070 assembler->CheckCharacterGT('a' - 1, word); | |
2071 assembler->CheckCharacterLT('9' + 1, word); | |
2072 assembler->CheckCharacterLT('A', non_word); | |
2073 assembler->CheckCharacterLT('Z' + 1, word); | |
2074 if (fall_through_on_word) { | |
2075 assembler->CheckNotCharacter('_', non_word); | |
2076 } else { | |
2077 assembler->CheckCharacter('_', word); | |
2078 } | |
2079 } | |
2080 | |
2081 | |
2082 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). | |
2083 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { | |
2084 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2085 Trace::TriBool next_is_word_character = Trace::UNKNOWN; | |
2086 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); | |
2087 BoyerMooreLookahead* lookahead = bm_info(not_at_start); | |
2088 if (lookahead == NULL) { | |
2089 intptr_t eats_at_least = | |
2090 Utils::Minimum(kMaxLookaheadForBoyerMoore, | |
2091 EatsAtLeast(kMaxLookaheadForBoyerMoore, | |
2092 kRecursionBudget, | |
2093 not_at_start)); | |
2094 if (eats_at_least >= 1) { | |
2095 BoyerMooreLookahead* bm = | |
2096 new(I) BoyerMooreLookahead(eats_at_least, compiler, I); | |
2097 FillInBMInfo(0, kRecursionBudget, bm, not_at_start); | |
2098 if (bm->at(0)->is_non_word()) | |
2099 next_is_word_character = Trace::FALSE_VALUE; | |
2100 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; | |
2101 } | |
2102 } else { | |
2103 if (lookahead->at(0)->is_non_word()) | |
2104 next_is_word_character = Trace::FALSE_VALUE; | |
2105 if (lookahead->at(0)->is_word()) | |
2106 next_is_word_character = Trace::TRUE_VALUE; | |
2107 } | |
2108 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); | |
2109 if (next_is_word_character == Trace::UNKNOWN) { | |
2110 BlockLabel before_non_word; | |
2111 BlockLabel before_word; | |
2112 if (trace->characters_preloaded() != 1) { | |
2113 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); | |
2114 } | |
2115 // Fall through on non-word. | |
2116 EmitWordCheck(assembler, &before_word, &before_non_word, false); | |
2117 // Next character is not a word character. | |
2118 assembler->BindBlock(&before_non_word); | |
2119 BlockLabel ok; | |
2120 // Backtrack on \B (non-boundary check) if previous is a word, | |
2121 // since we know next *is not* a word and this would be a boundary. | |
2122 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); | |
2123 | |
2124 if (!assembler->IsClosed()) { | |
2125 assembler->GoTo(&ok); | |
2126 } | |
2127 | |
2128 assembler->BindBlock(&before_word); | |
2129 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); | |
2130 assembler->BindBlock(&ok); | |
2131 } else if (next_is_word_character == Trace::TRUE_VALUE) { | |
2132 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); | |
2133 } else { | |
2134 ASSERT(next_is_word_character == Trace::FALSE_VALUE); | |
2135 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); | |
2136 } | |
2137 } | |
2138 | |
2139 | |
2140 void AssertionNode::BacktrackIfPrevious( | |
2141 RegExpCompiler* compiler, | |
2142 Trace* trace, | |
2143 AssertionNode::IfPrevious backtrack_if_previous) { | |
2144 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2145 Trace new_trace(*trace); | |
2146 new_trace.InvalidateCurrentCharacter(); | |
2147 | |
2148 BlockLabel fall_through, dummy; | |
2149 | |
2150 BlockLabel* non_word = backtrack_if_previous == kIsNonWord ? | |
2151 new_trace.backtrack() : | |
2152 &fall_through; | |
2153 BlockLabel* word = backtrack_if_previous == kIsNonWord ? | |
2154 &fall_through : | |
2155 new_trace.backtrack(); | |
2156 | |
2157 if (new_trace.cp_offset() == 0) { | |
2158 // The start of input counts as a non-word character, so the question is | |
2159 // decided if we are at the start. | |
2160 assembler->CheckAtStart(non_word); | |
2161 } | |
2162 // We already checked that we are not at the start of input so it must be | |
2163 // OK to load the previous character. | |
2164 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); | |
2165 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); | |
2166 | |
2167 assembler->BindBlock(&fall_through); | |
2168 on_success()->Emit(compiler, &new_trace); | |
2169 } | |
2170 | |
2171 | |
2172 static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) { | |
2173 if (quick_check == NULL) return false; | |
2174 if (offset >= quick_check->characters()) return false; | |
2175 return quick_check->positions(offset)->determines_perfectly; | |
2176 } | |
2177 | |
2178 | |
2179 static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) { | |
2180 if (index > *checked_up_to) { | |
2181 *checked_up_to = index; | |
2182 } | |
2183 } | |
2184 | |
2185 | |
2186 typedef bool EmitCharacterFunction(Isolate* isolate, | |
2187 RegExpCompiler* compiler, | |
2188 uint16_t c, | |
2189 BlockLabel* on_failure, | |
2190 intptr_t cp_offset, | |
2191 bool check, | |
2192 bool preloaded); | |
2193 | |
2194 | |
2195 static inline bool EmitSimpleCharacter(Isolate* isolate, | |
2196 RegExpCompiler* compiler, | |
2197 uint16_t c, | |
2198 BlockLabel* on_failure, | |
2199 intptr_t cp_offset, | |
2200 bool check, | |
2201 bool preloaded) { | |
2202 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2203 bool bound_checked = false; | |
2204 if (!preloaded) { | |
2205 assembler->LoadCurrentCharacter( | |
2206 cp_offset, | |
2207 on_failure, | |
2208 check); | |
2209 bound_checked = true; | |
2210 } | |
2211 assembler->CheckNotCharacter(c, on_failure); | |
2212 return bound_checked; | |
2213 } | |
2214 | |
2215 | |
2216 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, | |
2217 bool ascii, | |
2218 uint16_t c1, | |
2219 uint16_t c2, | |
2220 BlockLabel* on_failure) { | |
2221 uint16_t char_mask; | |
2222 if (ascii) { | |
2223 char_mask = Symbols::kMaxOneCharCodeSymbol; | |
2224 } else { | |
2225 char_mask = Utf16::kMaxCodeUnit; | |
2226 } | |
2227 uint16_t exor = c1 ^ c2; | |
2228 // Check whether exor has only one bit set. | |
2229 if (((exor - 1) & exor) == 0) { | |
2230 // If c1 and c2 differ only by one bit. | |
2231 // Ecma262UnCanonicalize always gives the highest number last. | |
2232 ASSERT(c2 > c1); | |
2233 uint16_t mask = char_mask ^ exor; | |
2234 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); | |
2235 return true; | |
2236 } | |
2237 ASSERT(c2 > c1); | |
2238 uint16_t diff = c2 - c1; | |
2239 if (((diff - 1) & diff) == 0 && c1 >= diff) { | |
2240 // If the characters differ by 2^n but don't differ by one bit then | |
2241 // subtract the difference from the found character, then do the or | |
2242 // trick. We avoid the theoretical case where negative numbers are | |
2243 // involved in order to simplify code generation. | |
2244 uint16_t mask = char_mask ^ diff; | |
2245 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, | |
2246 diff, | |
2247 mask, | |
2248 on_failure); | |
2249 return true; | |
2250 } | |
2251 return false; | |
2252 } | |
2253 | |
2254 // Only emits letters (things that have case). Only used for case independent | |
2255 // matches. | |
2256 static inline bool EmitAtomLetter(Isolate* isolate, | |
2257 RegExpCompiler* compiler, | |
2258 uint16_t c, | |
2259 BlockLabel* on_failure, | |
2260 intptr_t cp_offset, | |
2261 bool check, | |
2262 bool preloaded) { | |
2263 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | |
2264 bool ascii = compiler->ascii(); | |
2265 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
2266 intptr_t length = GetCaseIndependentLetters(c, ascii, chars); | |
2267 if (length <= 1) return false; | |
2268 // We may not need to check against the end of the input string | |
2269 // if this character lies before a character that matched. | |
2270 if (!preloaded) { | |
2271 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | |
2272 } | |
2273 BlockLabel ok; | |
2274 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | |
2275 switch (length) { | |
2276 case 2: { | |
2277 if (ShortCutEmitCharacterPair(macro_assembler, | |
2278 ascii, | |
2279 chars[0], | |
2280 chars[1], | |
2281 on_failure)) { | |
2282 } else { | |
2283 macro_assembler->CheckCharacter(chars[0], &ok); | |
2284 macro_assembler->CheckNotCharacter(chars[1], on_failure); | |
2285 macro_assembler->BindBlock(&ok); | |
2286 } | |
2287 break; | |
2288 } | |
2289 case 4: | |
2290 macro_assembler->CheckCharacter(chars[3], &ok); | |
2291 // Fall through! | |
2292 case 3: | |
2293 macro_assembler->CheckCharacter(chars[0], &ok); | |
2294 macro_assembler->CheckCharacter(chars[1], &ok); | |
2295 macro_assembler->CheckNotCharacter(chars[2], on_failure); | |
2296 macro_assembler->BindBlock(&ok); | |
2297 break; | |
2298 default: | |
2299 UNREACHABLE(); | |
2300 break; | |
2301 } | |
2302 return true; | |
2303 } | |
2304 | |
2305 | |
2306 // Only emits non-letters (things that don't have case). Only used for case | |
2307 // independent matches. | |
2308 static inline bool EmitAtomNonLetter(Isolate* isolate, | |
2309 RegExpCompiler* compiler, | |
2310 uint16_t c, | |
2311 BlockLabel* on_failure, | |
2312 intptr_t cp_offset, | |
2313 bool check, | |
2314 bool preloaded) { | |
2315 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | |
2316 bool ascii = compiler->ascii(); | |
2317 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
2318 intptr_t length = GetCaseIndependentLetters(c, ascii, chars); | |
2319 if (length < 1) { | |
2320 // This can't match. Must be an ASCII subject and a non-ASCII character. | |
2321 // We do not need to do anything since the ASCII pass already handled this. | |
2322 return false; // Bounds not checked. | |
2323 } | |
2324 bool checked = false; | |
2325 // We handle the length > 1 case in a later pass. | |
2326 if (length == 1) { | |
2327 if (ascii && c > Symbols::kMaxOneCharCodeSymbol) { | |
2328 // Can't match - see above. | |
2329 return false; // Bounds not checked. | |
2330 } | |
2331 if (!preloaded) { | |
2332 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | |
2333 checked = check; | |
2334 } | |
2335 macro_assembler->CheckNotCharacter(c, on_failure); | |
2336 } | |
2337 return checked; | |
2338 } | |
2339 | |
2340 | |
2341 static void EmitBoundaryTest(RegExpMacroAssembler* masm, | |
2342 intptr_t border, | |
2343 BlockLabel* fall_through, | |
2344 BlockLabel* above_or_equal, | |
2345 BlockLabel* below) { | |
2346 if (below != fall_through) { | |
2347 masm->CheckCharacterLT(border, below); | |
2348 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); | |
2349 } else { | |
2350 masm->CheckCharacterGT(border - 1, above_or_equal); | |
2351 } | |
2352 } | |
2353 | |
2354 | |
2355 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, | |
2356 intptr_t first, | |
2357 intptr_t last, | |
2358 BlockLabel* fall_through, | |
2359 BlockLabel* in_range, | |
2360 BlockLabel* out_of_range) { | |
2361 if (in_range == fall_through) { | |
2362 if (first == last) { | |
2363 masm->CheckNotCharacter(first, out_of_range); | |
2364 } else { | |
2365 masm->CheckCharacterNotInRange(first, last, out_of_range); | |
2366 } | |
2367 } else { | |
2368 if (first == last) { | |
2369 masm->CheckCharacter(first, in_range); | |
2370 } else { | |
2371 masm->CheckCharacterInRange(first, last, in_range); | |
2372 } | |
2373 if (out_of_range != fall_through) masm->GoTo(out_of_range); | |
2374 } | |
2375 } | |
2376 | |
2377 | |
2378 static void CutOutRange(RegExpMacroAssembler* masm, | |
2379 ZoneGrowableArray<int>* ranges, | |
2380 intptr_t start_index, | |
2381 intptr_t end_index, | |
2382 intptr_t cut_index, | |
2383 BlockLabel* even_label, | |
2384 BlockLabel* odd_label) { | |
2385 bool odd = (((cut_index - start_index) & 1) == 1); | |
2386 BlockLabel* in_range_label = odd ? odd_label : even_label; | |
2387 BlockLabel dummy; | |
2388 EmitDoubleBoundaryTest(masm, | |
2389 ranges->At(cut_index), | |
2390 ranges->At(cut_index + 1) - 1, | |
2391 &dummy, | |
2392 in_range_label, | |
2393 &dummy); | |
2394 ASSERT(!dummy.IsLinked()); | |
2395 // Cut out the single range by rewriting the array. This creates a new | |
2396 // range that is a merger of the two ranges on either side of the one we | |
2397 // are cutting out. The oddity of the labels is preserved. | |
2398 for (intptr_t j = cut_index; j > start_index; j--) { | |
2399 (*ranges)[j] = ranges->At(j - 1); | |
2400 } | |
2401 for (intptr_t j = cut_index + 1; j < end_index; j++) { | |
2402 (*ranges)[j] = ranges->At(j + 1); | |
2403 } | |
2404 } | |
2405 | |
2406 | |
2407 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. | |
2408 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. | |
2409 static void EmitUseLookupTable( | |
2410 RegExpMacroAssembler* masm, | |
2411 ZoneGrowableArray<int>* ranges, | |
2412 intptr_t start_index, | |
2413 intptr_t end_index, | |
2414 intptr_t min_char, | |
2415 BlockLabel* fall_through, | |
2416 BlockLabel* even_label, | |
2417 BlockLabel* odd_label) { | |
2418 static const intptr_t kSize = RegExpMacroAssembler::kTableSize; | |
2419 static const intptr_t kMask = RegExpMacroAssembler::kTableMask; | |
2420 | |
2421 intptr_t base = (min_char & ~kMask); | |
2422 | |
2423 // Assert that everything is on one kTableSize page. | |
2424 for (intptr_t i = start_index; i <= end_index; i++) { | |
2425 ASSERT((ranges->At(i) & ~kMask) == base); | |
2426 } | |
2427 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base); | |
2428 | |
2429 char templ[kSize]; | |
2430 BlockLabel* on_bit_set; | |
2431 BlockLabel* on_bit_clear; | |
2432 intptr_t bit; | |
2433 if (even_label == fall_through) { | |
2434 on_bit_set = odd_label; | |
2435 on_bit_clear = even_label; | |
2436 bit = 1; | |
2437 } else { | |
2438 on_bit_set = even_label; | |
2439 on_bit_clear = odd_label; | |
2440 bit = 0; | |
2441 } | |
2442 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize; | |
2443 i++) { | |
2444 templ[i] = bit; | |
2445 } | |
2446 intptr_t j = 0; | |
2447 bit ^= 1; | |
2448 for (intptr_t i = start_index; i < end_index; i++) { | |
2449 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) { | |
2450 templ[j] = bit; | |
2451 } | |
2452 bit ^= 1; | |
2453 } | |
2454 for (intptr_t i = j; i < kSize; i++) { | |
2455 templ[i] = bit; | |
2456 } | |
2457 // TODO(erikcorry): Cache these. | |
2458 const TypedData& ba = TypedData::ZoneHandle( | |
2459 masm->isolate(), | |
2460 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld)); | |
2461 for (intptr_t i = 0; i < kSize; i++) { | |
2462 ba.SetUint8(i, templ[i]); | |
2463 } | |
2464 masm->CheckBitInTable(ba, on_bit_set); | |
2465 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); | |
2466 } | |
2467 | |
2468 | |
2469 // Unicode case. Split the search space into kSize spaces that are handled | |
2470 // with recursion. | |
2471 static void SplitSearchSpace(ZoneGrowableArray<int>* ranges, | |
2472 intptr_t start_index, | |
2473 intptr_t end_index, | |
2474 intptr_t* new_start_index, | |
2475 intptr_t* new_end_index, | |
2476 intptr_t* border) { | |
2477 static const intptr_t kSize = RegExpMacroAssembler::kTableSize; | |
2478 static const intptr_t kMask = RegExpMacroAssembler::kTableMask; | |
2479 | |
2480 intptr_t first = ranges->At(start_index); | |
2481 intptr_t last = ranges->At(end_index) - 1; | |
2482 | |
2483 *new_start_index = start_index; | |
2484 *border = (ranges->At(start_index) & ~kMask) + kSize; | |
2485 while (*new_start_index < end_index) { | |
2486 if (ranges->At(*new_start_index) > *border) break; | |
2487 (*new_start_index)++; | |
2488 } | |
2489 // new_start_index is the index of the first edge that is beyond the | |
2490 // current kSize space. | |
2491 | |
2492 // For very large search spaces we do a binary chop search of the non-ASCII | |
2493 // space instead of just going to the end of the current kSize space. The | |
2494 // heuristics are complicated a little by the fact that any 128-character | |
2495 // encoding space can be quickly tested with a table lookup, so we don't | |
2496 // wish to do binary chop search at a smaller granularity than that. A | |
2497 // 128-character space can take up a lot of space in the ranges array if, | |
2498 // for example, we only want to match every second character (eg. the lower | |
2499 // case characters on some Unicode pages). | |
2500 intptr_t binary_chop_index = (end_index + start_index) / 2; | |
2501 // The first test ensures that we get to the code that handles the ASCII | |
2502 // range with a single not-taken branch, speeding up this important | |
2503 // character range (even non-ASCII charset-based text has spaces and | |
2504 // punctuation). | |
2505 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // ASCII case. | |
2506 end_index - start_index > (*new_start_index - start_index) * 2 && | |
2507 last - first > kSize * 2 && | |
2508 binary_chop_index > *new_start_index && | |
2509 ranges->At(binary_chop_index) >= first + 2 * kSize) { | |
2510 intptr_t scan_forward_for_section_border = binary_chop_index;; | |
2511 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1; | |
2512 | |
2513 while (scan_forward_for_section_border < end_index) { | |
2514 if (ranges->At(scan_forward_for_section_border) > new_border) { | |
2515 *new_start_index = scan_forward_for_section_border; | |
2516 *border = new_border; | |
2517 break; | |
2518 } | |
2519 scan_forward_for_section_border++; | |
2520 } | |
2521 } | |
2522 | |
2523 ASSERT(*new_start_index > start_index); | |
2524 *new_end_index = *new_start_index - 1; | |
2525 if (ranges->At(*new_end_index) == *border) { | |
2526 (*new_end_index)--; | |
2527 } | |
2528 if (*border >= ranges->At(end_index)) { | |
2529 *border = ranges->At(end_index); | |
2530 *new_start_index = end_index; // Won't be used. | |
2531 *new_end_index = end_index - 1; | |
2532 } | |
2533 } | |
2534 | |
2535 | |
2536 // Gets a series of segment boundaries representing a character class. If the | |
2537 // character is in the range between an even and an odd boundary (counting from | |
2538 // start_index) then go to even_label, otherwise go to odd_label. We already | |
2539 // know that the character is in the range of min_char to max_char inclusive. | |
2540 // Either label can be NULL indicating backtracking. Either label can also be | |
2541 // equal to the fall_through label. | |
2542 static void GenerateBranches(RegExpMacroAssembler* masm, | |
2543 ZoneGrowableArray<int>* ranges, | |
2544 intptr_t start_index, | |
2545 intptr_t end_index, | |
2546 uint16_t min_char, | |
2547 uint16_t max_char, | |
2548 BlockLabel* fall_through, | |
2549 BlockLabel* even_label, | |
2550 BlockLabel* odd_label) { | |
2551 intptr_t first = ranges->At(start_index); | |
2552 intptr_t last = ranges->At(end_index) - 1; | |
2553 | |
2554 ASSERT(min_char < first); | |
2555 | |
2556 // Just need to test if the character is before or on-or-after | |
2557 // a particular character. | |
2558 if (start_index == end_index) { | |
2559 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); | |
2560 return; | |
2561 } | |
2562 | |
2563 // Another almost trivial case: There is one interval in the middle that is | |
2564 // different from the end intervals. | |
2565 if (start_index + 1 == end_index) { | |
2566 EmitDoubleBoundaryTest( | |
2567 masm, first, last, fall_through, even_label, odd_label); | |
2568 return; | |
2569 } | |
2570 | |
2571 // It's not worth using table lookup if there are very few intervals in the | |
2572 // character class. | |
2573 if (end_index - start_index <= 6) { | |
2574 // It is faster to test for individual characters, so we look for those | |
2575 // first, then try arbitrary ranges in the second round. | |
2576 static intptr_t kNoCutIndex = -1; | |
2577 intptr_t cut = kNoCutIndex; | |
2578 for (intptr_t i = start_index; i < end_index; i++) { | |
2579 if (ranges->At(i) == ranges->At(i + 1) - 1) { | |
2580 cut = i; | |
2581 break; | |
2582 } | |
2583 } | |
2584 if (cut == kNoCutIndex) cut = start_index; | |
2585 CutOutRange( | |
2586 masm, ranges, start_index, end_index, cut, even_label, odd_label); | |
2587 ASSERT(end_index - start_index >= 2); | |
2588 GenerateBranches(masm, | |
2589 ranges, | |
2590 start_index + 1, | |
2591 end_index - 1, | |
2592 min_char, | |
2593 max_char, | |
2594 fall_through, | |
2595 even_label, | |
2596 odd_label); | |
2597 return; | |
2598 } | |
2599 | |
2600 // If there are a lot of intervals in the regexp, then we will use tables to | |
2601 // determine whether the character is inside or outside the character class. | |
2602 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits; | |
2603 | |
2604 if ((max_char >> kBits) == (min_char >> kBits)) { | |
2605 EmitUseLookupTable(masm, | |
2606 ranges, | |
2607 start_index, | |
2608 end_index, | |
2609 min_char, | |
2610 fall_through, | |
2611 even_label, | |
2612 odd_label); | |
2613 return; | |
2614 } | |
2615 | |
2616 if ((min_char >> kBits) != (first >> kBits)) { | |
2617 masm->CheckCharacterLT(first, odd_label); | |
2618 GenerateBranches(masm, | |
2619 ranges, | |
2620 start_index + 1, | |
2621 end_index, | |
2622 first, | |
2623 max_char, | |
2624 fall_through, | |
2625 odd_label, | |
2626 even_label); | |
2627 return; | |
2628 } | |
2629 | |
2630 intptr_t new_start_index = 0; | |
2631 intptr_t new_end_index = 0; | |
2632 intptr_t border = 0; | |
2633 | |
2634 SplitSearchSpace(ranges, | |
2635 start_index, | |
2636 end_index, | |
2637 &new_start_index, | |
2638 &new_end_index, | |
2639 &border); | |
2640 | |
2641 BlockLabel handle_rest; | |
2642 BlockLabel* above = &handle_rest; | |
2643 if (border == last + 1) { | |
2644 // We didn't find any section that started after the limit, so everything | |
2645 // above the border is one of the terminal labels. | |
2646 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; | |
2647 ASSERT(new_end_index == end_index - 1); | |
2648 } | |
2649 | |
2650 ASSERT(start_index <= new_end_index); | |
2651 ASSERT(new_start_index <= end_index); | |
2652 ASSERT(start_index < new_start_index); | |
2653 ASSERT(new_end_index < end_index); | |
2654 ASSERT(new_end_index + 1 == new_start_index || | |
2655 (new_end_index + 2 == new_start_index && | |
2656 border == ranges->At(new_end_index + 1))); | |
2657 ASSERT(min_char < border - 1); | |
2658 ASSERT(border < max_char); | |
2659 ASSERT(ranges->At(new_end_index) < border); | |
2660 ASSERT(border < ranges->At(new_start_index) || | |
2661 (border == ranges->At(new_start_index) && | |
2662 new_start_index == end_index && | |
2663 new_end_index == end_index - 1 && | |
2664 border == last + 1)); | |
2665 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1)); | |
2666 | |
2667 masm->CheckCharacterGT(border - 1, above); | |
2668 BlockLabel dummy; | |
2669 GenerateBranches(masm, | |
2670 ranges, | |
2671 start_index, | |
2672 new_end_index, | |
2673 min_char, | |
2674 border - 1, | |
2675 &dummy, | |
2676 even_label, | |
2677 odd_label); | |
2678 | |
2679 if (handle_rest.IsLinked()) { | |
2680 masm->BindBlock(&handle_rest); | |
2681 bool flip = (new_start_index & 1) != (start_index & 1); | |
2682 GenerateBranches(masm, | |
2683 ranges, | |
2684 new_start_index, | |
2685 end_index, | |
2686 border, | |
2687 max_char, | |
2688 &dummy, | |
2689 flip ? odd_label : even_label, | |
2690 flip ? even_label : odd_label); | |
2691 } | |
2692 } | |
2693 | |
2694 | |
2695 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | |
2696 RegExpCharacterClass* cc, | |
2697 bool ascii, | |
2698 BlockLabel* on_failure, | |
2699 intptr_t cp_offset, | |
2700 bool check_offset, | |
2701 bool preloaded, | |
2702 Isolate* isolate) { | |
2703 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges(); | |
2704 if (!CharacterRange::IsCanonical(ranges)) { | |
2705 CharacterRange::Canonicalize(ranges); | |
2706 } | |
2707 | |
2708 intptr_t max_char; | |
2709 if (ascii) { | |
2710 max_char = Symbols::kMaxOneCharCodeSymbol; | |
2711 } else { | |
2712 max_char = Utf16::kMaxCodeUnit; | |
2713 } | |
2714 | |
2715 intptr_t range_count = ranges->length(); | |
2716 | |
2717 intptr_t last_valid_range = range_count - 1; | |
2718 while (last_valid_range >= 0) { | |
2719 CharacterRange& range = (*ranges)[last_valid_range]; | |
2720 if (range.from() <= max_char) { | |
2721 break; | |
2722 } | |
2723 last_valid_range--; | |
2724 } | |
2725 | |
2726 if (last_valid_range < 0) { | |
2727 if (!cc->is_negated()) { | |
2728 macro_assembler->GoTo(on_failure); | |
2729 } | |
2730 if (check_offset) { | |
2731 macro_assembler->CheckPosition(cp_offset, on_failure); | |
2732 } | |
2733 return; | |
2734 } | |
2735 | |
2736 if (last_valid_range == 0 && | |
2737 ranges->At(0).IsEverything(max_char)) { | |
2738 if (cc->is_negated()) { | |
2739 macro_assembler->GoTo(on_failure); | |
2740 } else { | |
2741 // This is a common case hit by non-anchored expressions. | |
2742 if (check_offset) { | |
2743 macro_assembler->CheckPosition(cp_offset, on_failure); | |
2744 } | |
2745 } | |
2746 return; | |
2747 } | |
2748 if (last_valid_range == 0 && | |
2749 !cc->is_negated() && | |
2750 ranges->At(0).IsEverything(max_char)) { | |
2751 // This is a common case hit by non-anchored expressions. | |
2752 if (check_offset) { | |
2753 macro_assembler->CheckPosition(cp_offset, on_failure); | |
2754 } | |
2755 return; | |
2756 } | |
2757 | |
2758 if (!preloaded) { | |
2759 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); | |
2760 } | |
2761 | |
2762 if (cc->is_standard() && | |
2763 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), | |
2764 on_failure)) { | |
2765 return; | |
2766 } | |
2767 | |
2768 | |
2769 // A new list with ascending entries. Each entry is a code unit | |
2770 // where there is a boundary between code units that are part of | |
2771 // the class and code units that are not. Normally we insert an | |
2772 // entry at zero which goes to the failure label, but if there | |
2773 // was already one there we fall through for success on that entry. | |
2774 // Subsequent entries have alternating meaning (success/failure). | |
2775 ZoneGrowableArray<int>* range_boundaries = | |
2776 new(isolate) ZoneGrowableArray<int>(last_valid_range); | |
2777 | |
2778 bool zeroth_entry_is_failure = !cc->is_negated(); | |
2779 | |
2780 for (intptr_t i = 0; i <= last_valid_range; i++) { | |
2781 CharacterRange& range = (*ranges)[i]; | |
2782 if (range.from() == 0) { | |
2783 ASSERT(i == 0); | |
2784 zeroth_entry_is_failure = !zeroth_entry_is_failure; | |
2785 } else { | |
2786 range_boundaries->Add(range.from()); | |
2787 } | |
2788 range_boundaries->Add(range.to() + 1); | |
2789 } | |
2790 intptr_t end_index = range_boundaries->length() - 1; | |
2791 if (range_boundaries->At(end_index) > max_char) { | |
2792 end_index--; | |
2793 } | |
2794 | |
2795 BlockLabel fall_through; | |
2796 GenerateBranches(macro_assembler, | |
2797 range_boundaries, | |
2798 0, // start_index. | |
2799 end_index, | |
2800 0, // min_char. | |
2801 max_char, | |
2802 &fall_through, | |
2803 zeroth_entry_is_failure ? &fall_through : on_failure, | |
2804 zeroth_entry_is_failure ? on_failure : &fall_through); | |
2805 macro_assembler->BindBlock(&fall_through); | |
2806 } | |
2807 | |
2808 | |
2809 // We call this repeatedly to generate code for each pass over the text node. | |
2810 // The passes are in increasing order of difficulty because we hope one | |
2811 // of the first passes will fail in which case we are saved the work of the | |
2812 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ | |
2813 // we will check the '%' in the first pass, the case independent 'a' in the | |
2814 // second pass and the character class in the last pass. | |
2815 // | |
2816 // The passes are done from right to left, so for example to test for /bar/ | |
2817 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 | |
2818 // and then a 'b' with offset 0. This means we can avoid the end-of-input | |
2819 // bounds check most of the time. In the example we only need to check for | |
2820 // end-of-input when loading the putative 'r'. | |
2821 // | |
2822 // A slight complication involves the fact that the first character may already | |
2823 // be fetched into a register by the previous node. In this case we want to | |
2824 // do the test for that character first. We do this in separate passes. The | |
2825 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a | |
2826 // pass has been performed then subsequent passes will have true in | |
2827 // first_element_checked to indicate that that character does not need to be | |
2828 // checked again. | |
2829 // | |
2830 // In addition to all this we are passed a Trace, which can | |
2831 // contain an AlternativeGeneration object. In this AlternativeGeneration | |
2832 // object we can see details of any quick check that was already passed in | |
2833 // order to get to the code we are now generating. The quick check can involve | |
2834 // loading characters, which means we do not need to recheck the bounds | |
2835 // up to the limit the quick check already checked. In addition the quick | |
2836 // check can have involved a mask and compare operation which may simplify | |
2837 // or obviate the need for further checks at some character positions. | |
2838 void TextNode::TextEmitPass(RegExpCompiler* compiler, | |
2839 TextEmitPassType pass, | |
2840 bool preloaded, | |
2841 Trace* trace, | |
2842 bool first_element_checked, | |
2843 intptr_t* checked_up_to) { | |
2844 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2845 bool ascii = compiler->ascii(); | |
2846 BlockLabel* backtrack = trace->backtrack(); | |
2847 QuickCheckDetails* quick_check = trace->quick_check_performed(); | |
2848 intptr_t element_count = elms_->length(); | |
2849 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | |
2850 TextElement elm = elms_->At(i); | |
2851 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset(); | |
2852 if (elm.text_type() == TextElement::ATOM) { | |
2853 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data(); | |
2854 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) { | |
2855 if (first_element_checked && i == 0 && j == 0) continue; | |
2856 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; | |
2857 EmitCharacterFunction* emit_function = NULL; | |
2858 switch (pass) { | |
2859 case NON_ASCII_MATCH: | |
2860 ASSERT(ascii); | |
2861 if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) { | |
2862 assembler->GoTo(backtrack); | |
2863 return; | |
2864 } | |
2865 break; | |
2866 case NON_LETTER_CHARACTER_MATCH: | |
2867 emit_function = &EmitAtomNonLetter; | |
2868 break; | |
2869 case SIMPLE_CHARACTER_MATCH: | |
2870 emit_function = &EmitSimpleCharacter; | |
2871 break; | |
2872 case CASE_CHARACTER_MATCH: | |
2873 emit_function = &EmitAtomLetter; | |
2874 break; | |
2875 default: | |
2876 break; | |
2877 } | |
2878 if (emit_function != NULL) { | |
2879 bool bound_checked = emit_function(I, | |
2880 compiler, | |
2881 quarks->At(j), | |
2882 backtrack, | |
2883 cp_offset + j, | |
2884 *checked_up_to < cp_offset + j, | |
2885 preloaded); | |
2886 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); | |
2887 } | |
2888 } | |
2889 } else { | |
2890 ASSERT(elm.text_type() == TextElement::CHAR_CLASS); | |
2891 if (pass == CHARACTER_CLASS_MATCH) { | |
2892 if (first_element_checked && i == 0) continue; | |
2893 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; | |
2894 RegExpCharacterClass* cc = elm.char_class(); | |
2895 EmitCharClass(assembler, | |
2896 cc, | |
2897 ascii, | |
2898 backtrack, | |
2899 cp_offset, | |
2900 *checked_up_to < cp_offset, | |
2901 preloaded, | |
2902 I); | |
2903 UpdateBoundsCheck(cp_offset, checked_up_to); | |
2904 } | |
2905 } | |
2906 } | |
2907 } | |
2908 | |
2909 | |
2910 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | |
2911 intptr_t eats_at_least) { | |
2912 intptr_t preload_characters = Utils::Minimum(static_cast<intptr_t>(4), | |
2913 eats_at_least); | |
2914 if (compiler->macro_assembler()->CanReadUnaligned()) { | |
2915 bool ascii = compiler->ascii(); | |
2916 if (ascii) { | |
2917 if (preload_characters > 4) preload_characters = 4; | |
2918 // We can't preload 3 characters because there is no machine instruction | |
2919 // to do that. We can't just load 4 because we could be reading | |
2920 // beyond the end of the string, which could cause a memory fault. | |
2921 if (preload_characters == 3) preload_characters = 2; | |
2922 } else { | |
2923 if (preload_characters > 2) preload_characters = 2; | |
2924 } | |
2925 } else { | |
2926 if (preload_characters > 1) preload_characters = 1; | |
2927 } | |
2928 return preload_characters; | |
2929 } | |
2930 | |
2931 | |
2932 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find, | |
2933 intptr_t budget, | |
2934 bool not_at_start) { | |
2935 intptr_t answer = Length(); | |
2936 if (answer >= still_to_find) return answer; | |
2937 if (budget <= 0) return answer; | |
2938 // We are not at start after this node so we set the last argument to 'true'. | |
2939 return answer + on_success()->EatsAtLeast(still_to_find - answer, | |
2940 budget - 1, | |
2941 true); | |
2942 } | |
2943 | |
2944 | |
2945 intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find, | |
2946 intptr_t budget, | |
2947 bool not_at_start) { | |
2948 if (budget <= 0) return 0; | |
2949 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! | |
2950 return on_success()->EatsAtLeast(still_to_find, | |
2951 budget - 1, | |
2952 not_at_start); | |
2953 } | |
2954 | |
2955 | |
2956 intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find, | |
2957 intptr_t budget, | |
2958 bool not_at_start) { | |
2959 if (budget <= 0) return 0; | |
2960 // If we know we are not at the start and we are asked "how many characters | |
2961 // will you match if you succeed?" then we can answer anything since false | |
2962 // implies false. So lets just return the max answer (still_to_find) since | |
2963 // that won't prevent us from preloading a lot of characters for the other | |
2964 // branches in the node graph. | |
2965 if (assertion_type() == AT_START && not_at_start) return still_to_find; | |
2966 return on_success()->EatsAtLeast(still_to_find, | |
2967 budget - 1, | |
2968 not_at_start); | |
2969 } | |
2970 | |
2971 | |
2972 intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find, | |
2973 intptr_t budget, | |
2974 bool not_at_start) { | |
2975 if (budget <= 0) return 0; | |
2976 return on_success()->EatsAtLeast(still_to_find, | |
2977 budget - 1, | |
2978 not_at_start); | |
2979 } | |
2980 | |
2981 | |
2982 intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find, | |
2983 intptr_t budget, | |
2984 RegExpNode* ignore_this_node, | |
2985 bool not_at_start) { | |
2986 if (budget <= 0) return 0; | |
2987 intptr_t min = 100; | |
2988 intptr_t choice_count = alternatives_->length(); | |
2989 budget = (budget - 1) / choice_count; | |
2990 for (intptr_t i = 0; i < choice_count; i++) { | |
2991 RegExpNode* node = (*alternatives_)[i].node(); | |
2992 if (node == ignore_this_node) continue; | |
2993 intptr_t node_eats_at_least = | |
2994 node->EatsAtLeast(still_to_find, budget, not_at_start); | |
2995 if (node_eats_at_least < min) min = node_eats_at_least; | |
2996 if (min == 0) return 0; | |
2997 } | |
2998 return min; | |
2999 } | |
3000 | |
3001 | |
3002 intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find, | |
3003 intptr_t budget, | |
3004 bool not_at_start) { | |
3005 return EatsAtLeastHelper(still_to_find, | |
3006 budget, | |
3007 NULL, | |
3008 not_at_start); | |
3009 } | |
3010 | |
3011 | |
3012 intptr_t NegativeLookaheadChoiceNode::EatsAtLeast(intptr_t still_to_find, | |
3013 intptr_t budget, | |
3014 bool not_at_start) { | |
3015 if (budget <= 0) return 0; | |
3016 // Alternative 0 is the negative lookahead, alternative 1 is what comes | |
3017 // afterwards. | |
3018 RegExpNode* node = (*alternatives_)[1].node(); | |
3019 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); | |
3020 } | |
3021 | |
3022 | |
3023 // Takes the left-most 1-bit and smears it out, setting all bits to its right. | |
3024 static inline uint32_t SmearBitsRight(uint32_t v) { | |
3025 v |= v >> 1; | |
3026 v |= v >> 2; | |
3027 v |= v >> 4; | |
3028 v |= v >> 8; | |
3029 v |= v >> 16; | |
3030 return v; | |
3031 } | |
3032 | |
3033 | |
3034 // Here is the meat of GetQuickCheckDetails (see also the comment on the | |
3035 // super-class in the .h file). | |
3036 // | |
3037 // We iterate along the text object, building up for each character a | |
3038 // mask and value that can be used to test for a quick failure to match. | |
3039 // The masks and values for the positions will be combined into a single | |
3040 // machine word for the current character width in order to be used in | |
3041 // generating a quick check. | |
3042 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, | |
3043 RegExpCompiler* compiler, | |
3044 intptr_t characters_filled_in, | |
3045 bool not_at_start) { | |
3046 ASSERT(characters_filled_in < details->characters()); | |
3047 intptr_t characters = details->characters(); | |
3048 intptr_t char_mask; | |
3049 if (compiler->ascii()) { | |
3050 char_mask = Symbols::kMaxOneCharCodeSymbol; | |
3051 } else { | |
3052 char_mask = Utf16::kMaxCodeUnit; | |
3053 } | |
3054 for (intptr_t k = 0; k < elms_->length(); k++) { | |
3055 TextElement elm = elms_->At(k); | |
3056 if (elm.text_type() == TextElement::ATOM) { | |
3057 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data(); | |
3058 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) { | |
3059 QuickCheckDetails::Position* pos = | |
3060 details->positions(characters_filled_in); | |
3061 uint16_t c = quarks->At(i); | |
3062 if (c > char_mask) { | |
3063 // If we expect a non-ASCII character from an ASCII string, | |
3064 // there is no way we can match. Not even case independent | |
3065 // matching can turn an ASCII character into non-ASCII or | |
3066 // vice versa. | |
3067 details->set_cannot_match(); | |
3068 pos->determines_perfectly = false; | |
3069 return; | |
3070 } | |
3071 if (compiler->ignore_case()) { | |
3072 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
3073 intptr_t length = | |
3074 GetCaseIndependentLetters(c, compiler->ascii(), chars); | |
3075 ASSERT(length != 0); // Can only happen if c > char_mask (see above). | |
3076 if (length == 1) { | |
3077 // This letter has no case equivalents, so it's nice and simple | |
3078 // and the mask-compare will determine definitely whether we have | |
3079 // a match at this character position. | |
3080 pos->mask = char_mask; | |
3081 pos->value = c; | |
3082 pos->determines_perfectly = true; | |
3083 } else { | |
3084 uint32_t common_bits = char_mask; | |
3085 uint32_t bits = chars[0]; | |
3086 for (intptr_t j = 1; j < length; j++) { | |
3087 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); | |
3088 common_bits ^= differing_bits; | |
3089 bits &= common_bits; | |
3090 } | |
3091 // If length is 2 and common bits has only one zero in it then | |
3092 // our mask and compare instruction will determine definitely | |
3093 // whether we have a match at this character position. Otherwise | |
3094 // it can only be an approximate check. | |
3095 uint32_t one_zero = (common_bits | ~char_mask); | |
3096 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { | |
3097 pos->determines_perfectly = true; | |
3098 } | |
3099 pos->mask = common_bits; | |
3100 pos->value = bits; | |
3101 } | |
3102 } else { | |
3103 // Don't ignore case. Nice simple case where the mask-compare will | |
3104 // determine definitely whether we have a match at this character | |
3105 // position. | |
3106 pos->mask = char_mask; | |
3107 pos->value = c; | |
3108 pos->determines_perfectly = true; | |
3109 } | |
3110 characters_filled_in++; | |
3111 ASSERT(characters_filled_in <= details->characters()); | |
3112 if (characters_filled_in == details->characters()) { | |
3113 return; | |
3114 } | |
3115 } | |
3116 } else { | |
3117 QuickCheckDetails::Position* pos = | |
3118 details->positions(characters_filled_in); | |
3119 RegExpCharacterClass* tree = elm.char_class(); | |
3120 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges(); | |
3121 if (tree->is_negated()) { | |
3122 // A quick check uses multi-character mask and compare. There is no | |
3123 // useful way to incorporate a negative char class into this scheme | |
3124 // so we just conservatively create a mask and value that will always | |
3125 // succeed. | |
3126 pos->mask = 0; | |
3127 pos->value = 0; | |
3128 } else { | |
3129 intptr_t first_range = 0; | |
3130 while (ranges->At(first_range).from() > char_mask) { | |
3131 first_range++; | |
3132 if (first_range == ranges->length()) { | |
3133 details->set_cannot_match(); | |
3134 pos->determines_perfectly = false; | |
3135 return; | |
3136 } | |
3137 } | |
3138 CharacterRange range = ranges->At(first_range); | |
3139 uint16_t from = range.from(); | |
3140 uint16_t to = range.to(); | |
3141 if (to > char_mask) { | |
3142 to = char_mask; | |
3143 } | |
3144 uint32_t differing_bits = (from ^ to); | |
3145 // A mask and compare is only perfect if the differing bits form a | |
3146 // number like 00011111 with one single block of trailing 1s. | |
3147 if ((differing_bits & (differing_bits + 1)) == 0 && | |
3148 from + differing_bits == to) { | |
3149 pos->determines_perfectly = true; | |
3150 } | |
3151 uint32_t common_bits = ~SmearBitsRight(differing_bits); | |
3152 uint32_t bits = (from & common_bits); | |
3153 for (intptr_t i = first_range + 1; i < ranges->length(); i++) { | |
3154 CharacterRange range = ranges->At(i); | |
3155 uint16_t from = range.from(); | |
3156 uint16_t to = range.to(); | |
3157 if (from > char_mask) continue; | |
3158 if (to > char_mask) to = char_mask; | |
3159 // Here we are combining more ranges into the mask and compare | |
3160 // value. With each new range the mask becomes more sparse and | |
3161 // so the chances of a false positive rise. A character class | |
3162 // with multiple ranges is assumed never to be equivalent to a | |
3163 // mask and compare operation. | |
3164 pos->determines_perfectly = false; | |
3165 uint32_t new_common_bits = (from ^ to); | |
3166 new_common_bits = ~SmearBitsRight(new_common_bits); | |
3167 common_bits &= new_common_bits; | |
3168 bits &= new_common_bits; | |
3169 uint32_t differing_bits = (from & common_bits) ^ bits; | |
3170 common_bits ^= differing_bits; | |
3171 bits &= common_bits; | |
3172 } | |
3173 pos->mask = common_bits; | |
3174 pos->value = bits; | |
3175 } | |
3176 characters_filled_in++; | |
3177 ASSERT(characters_filled_in <= details->characters()); | |
3178 if (characters_filled_in == details->characters()) { | |
3179 return; | |
3180 } | |
3181 } | |
3182 } | |
3183 ASSERT(characters_filled_in != details->characters()); | |
3184 if (!details->cannot_match()) { | |
3185 on_success()-> GetQuickCheckDetails(details, | |
3186 compiler, | |
3187 characters_filled_in, | |
3188 true); | |
3189 } | |
3190 } | |
3191 | |
3192 | |
3193 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | |
3194 RegExpCompiler* compiler, | |
3195 intptr_t characters_filled_in, | |
3196 bool not_at_start) { | |
3197 if (body_can_be_zero_length_ || info()->visited) return; | |
3198 VisitMarker marker(info()); | |
3199 return ChoiceNode::GetQuickCheckDetails(details, | |
3200 compiler, | |
3201 characters_filled_in, | |
3202 not_at_start); | |
3203 } | |
3204 | |
3205 | |
3206 intptr_t TextNode::Length() { | |
3207 TextElement elm = elms_->Last(); | |
3208 ASSERT(elm.cp_offset() >= 0); | |
3209 return elm.cp_offset() + elm.length(); | |
3210 } | |
3211 | |
3212 | |
3213 bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) { | |
3214 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass); | |
3215 if (ignore_case) { | |
3216 return pass == SIMPLE_CHARACTER_MATCH; | |
3217 } else { | |
3218 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; | |
3219 } | |
3220 } | |
3221 | |
3222 | |
3223 static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges, | |
3224 const intptr_t* special_class, | |
3225 intptr_t length) { | |
3226 length--; // Remove final 0x10000. | |
3227 ASSERT(special_class[length] == 0x10000); | |
3228 ASSERT(ranges->length() != 0); | |
3229 ASSERT(length != 0); | |
3230 ASSERT(special_class[0] != 0); | |
3231 if (ranges->length() != (length >> 1) + 1) { | |
3232 return false; | |
3233 } | |
3234 CharacterRange range = ranges->At(0); | |
3235 if (range.from() != 0) { | |
3236 return false; | |
3237 } | |
3238 for (intptr_t i = 0; i < length; i += 2) { | |
3239 if (special_class[i] != (range.to() + 1)) { | |
3240 return false; | |
3241 } | |
3242 range = ranges->At((i >> 1) + 1); | |
3243 if (special_class[i+1] != range.from()) { | |
3244 return false; | |
3245 } | |
3246 } | |
3247 if (range.to() != 0xffff) { | |
3248 return false; | |
3249 } | |
3250 return true; | |
3251 } | |
3252 | |
3253 | |
3254 static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges, | |
3255 const intptr_t* special_class, | |
3256 intptr_t length) { | |
3257 length--; // Remove final 0x10000. | |
3258 ASSERT(special_class[length] == 0x10000); | |
3259 if (ranges->length() * 2 != length) { | |
3260 return false; | |
3261 } | |
3262 for (intptr_t i = 0; i < length; i += 2) { | |
3263 CharacterRange range = ranges->At(i >> 1); | |
3264 if (range.from() != special_class[i] || | |
3265 range.to() != special_class[i + 1] - 1) { | |
3266 return false; | |
3267 } | |
3268 } | |
3269 return true; | |
3270 } | |
3271 | |
3272 | |
3273 bool RegExpCharacterClass::is_standard() { | |
3274 // TODO(lrn): Remove need for this function, by not throwing away information | |
3275 // along the way. | |
3276 if (is_negated_) { | |
3277 return false; | |
3278 } | |
3279 if (set_.is_standard()) { | |
3280 return true; | |
3281 } | |
3282 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { | |
3283 set_.set_standard_set_type('s'); | |
3284 return true; | |
3285 } | |
3286 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { | |
3287 set_.set_standard_set_type('S'); | |
3288 return true; | |
3289 } | |
3290 if (CompareInverseRanges(set_.ranges(), | |
3291 kLineTerminatorRanges, | |
3292 kLineTerminatorRangeCount)) { | |
3293 set_.set_standard_set_type('.'); | |
3294 return true; | |
3295 } | |
3296 if (CompareRanges(set_.ranges(), | |
3297 kLineTerminatorRanges, | |
3298 kLineTerminatorRangeCount)) { | |
3299 set_.set_standard_set_type('n'); | |
3300 return true; | |
3301 } | |
3302 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { | |
3303 set_.set_standard_set_type('w'); | |
3304 return true; | |
3305 } | |
3306 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { | |
3307 set_.set_standard_set_type('W'); | |
3308 return true; | |
3309 } | |
3310 return false; | |
3311 } | |
3312 | |
3313 | |
3314 void TextNode::MakeCaseIndependent(bool is_ascii) { | |
3315 intptr_t element_count = elms_->length(); | |
3316 for (intptr_t i = 0; i < element_count; i++) { | |
3317 TextElement elm = elms_->At(i); | |
3318 if (elm.text_type() == TextElement::CHAR_CLASS) { | |
3319 RegExpCharacterClass* cc = elm.char_class(); | |
3320 // None of the standard character classes is different in the case | |
3321 // independent case and it slows us down if we don't know that. | |
3322 if (cc->is_standard()) continue; | |
3323 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges(); | |
3324 intptr_t range_count = ranges->length(); | |
3325 for (intptr_t j = 0; j < range_count; j++) { | |
3326 (*ranges)[j].AddCaseEquivalents(ranges, is_ascii, I); | |
3327 } | |
3328 } | |
3329 } | |
3330 } | |
3331 | |
3332 | |
3333 intptr_t TextNode::GreedyLoopTextLength() { | |
3334 TextElement elm = elms_->At(elms_->length() - 1); | |
3335 return elm.cp_offset() + elm.length(); | |
3336 } | |
3337 | |
3338 | |
3339 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | |
3340 RegExpCompiler* compiler) { | |
3341 if (elms_->length() != 1) return NULL; | |
3342 TextElement elm = elms_->At(0); | |
3343 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; | |
3344 RegExpCharacterClass* node = elm.char_class(); | |
3345 ZoneGrowableArray<CharacterRange>* ranges = node->ranges(); | |
3346 if (!CharacterRange::IsCanonical(ranges)) { | |
3347 CharacterRange::Canonicalize(ranges); | |
3348 } | |
3349 if (node->is_negated()) { | |
3350 return ranges->length() == 0 ? on_success() : NULL; | |
3351 } | |
3352 if (ranges->length() != 1) return NULL; | |
3353 uint32_t max_char; | |
3354 if (compiler->ascii()) { | |
3355 max_char = Symbols::kMaxOneCharCodeSymbol; | |
3356 } else { | |
3357 max_char = Utf16::kMaxCodeUnit; | |
3358 } | |
3359 return ranges->At(0).IsEverything(max_char) ? on_success() : NULL; | |
3360 } | |
3361 | |
3362 | |
3363 ActionNode* ActionNode::SetRegister(intptr_t reg, | |
3364 intptr_t val, | |
3365 RegExpNode* on_success) { | |
3366 ActionNode* result = | |
3367 new(on_success->isolate()) ActionNode(SET_REGISTER, on_success); | |
3368 result->data_.u_store_register.reg = reg; | |
3369 result->data_.u_store_register.value = val; | |
3370 return result; | |
3371 } | |
3372 | |
3373 | |
3374 ActionNode* ActionNode::IncrementRegister(intptr_t reg, | |
3375 RegExpNode* on_success) { | |
3376 ActionNode* result = | |
3377 new(on_success->isolate()) ActionNode(INCREMENT_REGISTER, on_success); | |
3378 result->data_.u_increment_register.reg = reg; | |
3379 return result; | |
3380 } | |
3381 | |
3382 | |
3383 ActionNode* ActionNode::StorePosition(intptr_t reg, | |
3384 bool is_capture, | |
3385 RegExpNode* on_success) { | |
3386 ActionNode* result = | |
3387 new(on_success->isolate()) ActionNode(STORE_POSITION, on_success); | |
3388 result->data_.u_position_register.reg = reg; | |
3389 result->data_.u_position_register.is_capture = is_capture; | |
3390 return result; | |
3391 } | |
3392 | |
3393 | |
3394 ActionNode* ActionNode::ClearCaptures(Interval range, | |
3395 RegExpNode* on_success) { | |
3396 ActionNode* result = | |
3397 new(on_success->isolate()) ActionNode(CLEAR_CAPTURES, on_success); | |
3398 result->data_.u_clear_captures.range_from = range.from(); | |
3399 result->data_.u_clear_captures.range_to = range.to(); | |
3400 return result; | |
3401 } | |
3402 | |
3403 | |
3404 ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg, | |
3405 intptr_t position_reg, | |
3406 RegExpNode* on_success) { | |
3407 ActionNode* result = | |
3408 new(on_success->isolate()) ActionNode(BEGIN_SUBMATCH, on_success); | |
3409 result->data_.u_submatch.stack_pointer_register = stack_reg; | |
3410 result->data_.u_submatch.current_position_register = position_reg; | |
3411 return result; | |
3412 } | |
3413 | |
3414 | |
3415 ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg, | |
3416 intptr_t position_reg, | |
3417 intptr_t clear_register_count, | |
3418 intptr_t clear_register_from, | |
3419 RegExpNode* on_success) { | |
3420 ActionNode* result = | |
3421 new(on_success->isolate()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, | |
3422 on_success); | |
3423 result->data_.u_submatch.stack_pointer_register = stack_reg; | |
3424 result->data_.u_submatch.current_position_register = position_reg; | |
3425 result->data_.u_submatch.clear_register_count = clear_register_count; | |
3426 result->data_.u_submatch.clear_register_from = clear_register_from; | |
3427 return result; | |
3428 } | |
3429 | |
3430 | |
3431 ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register, | |
3432 intptr_t repetition_register, | |
3433 intptr_t repetition_limit, | |
3434 RegExpNode* on_success) { | |
3435 ActionNode* result = | |
3436 new(on_success->isolate()) ActionNode(EMPTY_MATCH_CHECK, on_success); | |
3437 result->data_.u_empty_match_check.start_register = start_register; | |
3438 result->data_.u_empty_match_check.repetition_register = repetition_register; | |
3439 result->data_.u_empty_match_check.repetition_limit = repetition_limit; | |
3440 return result; | |
3441 } | |
3442 | |
3443 | |
3444 TextElement TextElement::Atom(RegExpAtom* atom) { | |
3445 return TextElement(ATOM, atom); | |
3446 } | |
3447 | |
3448 | |
3449 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { | |
3450 return TextElement(CHAR_CLASS, char_class); | |
3451 } | |
3452 | |
3453 | |
3454 intptr_t TextElement::length() const { | |
3455 switch (text_type()) { | |
3456 case ATOM: | |
3457 return atom()->length(); | |
3458 | |
3459 case CHAR_CLASS: | |
3460 return 1; | |
3461 } | |
3462 UNREACHABLE(); | |
3463 return 0; | |
3464 } | |
3465 | |
3466 | |
3467 static void AddClass(const intptr_t* elmv, | |
3468 intptr_t elmc, | |
3469 ZoneGrowableArray<CharacterRange>* ranges) { | |
3470 elmc--; | |
3471 ASSERT(elmv[elmc] == 0x10000); | |
3472 for (intptr_t i = 0; i < elmc; i += 2) { | |
3473 ASSERT(elmv[i] < elmv[i + 1]); | |
3474 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); | |
3475 } | |
3476 } | |
3477 | |
3478 | |
3479 static void AddClassNegated(const intptr_t *elmv, | |
3480 intptr_t elmc, | |
3481 ZoneGrowableArray<CharacterRange>* ranges) { | |
3482 elmc--; | |
3483 ASSERT(elmv[elmc] == 0x10000); | |
3484 ASSERT(elmv[0] != 0x0000); | |
3485 ASSERT(elmv[elmc-1] != Utf16::kMaxCodeUnit); | |
3486 uint16_t last = 0x0000; | |
3487 for (intptr_t i = 0; i < elmc; i += 2) { | |
3488 ASSERT(last <= elmv[i] - 1); | |
3489 ASSERT(elmv[i] < elmv[i + 1]); | |
3490 ranges->Add(CharacterRange(last, elmv[i] - 1)); | |
3491 last = elmv[i + 1]; | |
3492 } | |
3493 ranges->Add(CharacterRange(last, Utf16::kMaxCodeUnit)); | |
3494 } | |
3495 | |
3496 | |
3497 void CharacterRange::AddClassEscape(uint16_t type, | |
3498 ZoneGrowableArray<CharacterRange>* ranges) { | |
3499 switch (type) { | |
3500 case 's': | |
3501 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); | |
3502 break; | |
3503 case 'S': | |
3504 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); | |
3505 break; | |
3506 case 'w': | |
3507 AddClass(kWordRanges, kWordRangeCount, ranges); | |
3508 break; | |
3509 case 'W': | |
3510 AddClassNegated(kWordRanges, kWordRangeCount, ranges); | |
3511 break; | |
3512 case 'd': | |
3513 AddClass(kDigitRanges, kDigitRangeCount, ranges); | |
3514 break; | |
3515 case 'D': | |
3516 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); | |
3517 break; | |
3518 case '.': | |
3519 AddClassNegated(kLineTerminatorRanges, | |
3520 kLineTerminatorRangeCount, | |
3521 ranges); | |
3522 break; | |
3523 // This is not a character range as defined by the spec but a | |
3524 // convenient shorthand for a character class that matches any | |
3525 // character. | |
3526 case '*': | |
3527 ranges->Add(CharacterRange::Everything()); | |
3528 break; | |
3529 // This is the set of characters matched by the $ and ^ symbols | |
3530 // in multiline mode. | |
3531 case 'n': | |
3532 AddClass(kLineTerminatorRanges, | |
3533 kLineTerminatorRangeCount, | |
3534 ranges); | |
3535 break; | |
3536 default: | |
3537 UNREACHABLE(); | |
3538 } | |
3539 } | |
3540 | |
3541 | |
3542 // Move a number of elements in a zonelist to another position | |
3543 // in the same list. Handles overlapping source and target areas. | |
3544 static void MoveRanges(ZoneGrowableArray<CharacterRange>* list, | |
3545 intptr_t from, | |
3546 intptr_t to, | |
3547 intptr_t count) { | |
3548 // Ranges are potentially overlapping. | |
3549 if (from < to) { | |
3550 for (intptr_t i = count - 1; i >= 0; i--) { | |
3551 (*list)[to + i] = list->At(from + i); | |
3552 } | |
3553 } else { | |
3554 for (intptr_t i = 0; i < count; i++) { | |
3555 (*list)[to + i] = list->At(from + i); | |
3556 } | |
3557 } | |
3558 } | |
3559 | |
3560 | |
3561 static intptr_t InsertRangeInCanonicalList( | |
3562 ZoneGrowableArray<CharacterRange>* list, | |
3563 intptr_t count, | |
3564 CharacterRange insert) { | |
3565 // Inserts a range into list[0..count[, which must be sorted | |
3566 // by from value and non-overlapping and non-adjacent, using at most | |
3567 // list[0..count] for the result. Returns the number of resulting | |
3568 // canonicalized ranges. Inserting a range may collapse existing ranges into | |
3569 // fewer ranges, so the return value can be anything in the range 1..count+1. | |
3570 uint16_t from = insert.from(); | |
3571 uint16_t to = insert.to(); | |
3572 intptr_t start_pos = 0; | |
3573 intptr_t end_pos = count; | |
3574 for (intptr_t i = count - 1; i >= 0; i--) { | |
3575 CharacterRange current = list->At(i); | |
3576 if (current.from() > to + 1) { | |
3577 end_pos = i; | |
3578 } else if (current.to() + 1 < from) { | |
3579 start_pos = i + 1; | |
3580 break; | |
3581 } | |
3582 } | |
3583 | |
3584 // Inserted range overlaps, or is adjacent to, ranges at positions | |
3585 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are | |
3586 // not affected by the insertion. | |
3587 // If start_pos == end_pos, the range must be inserted before start_pos. | |
3588 // if start_pos < end_pos, the entire range from start_pos to end_pos | |
3589 // must be merged with the insert range. | |
3590 | |
3591 if (start_pos == end_pos) { | |
3592 // Insert between existing ranges at position start_pos. | |
3593 if (start_pos < count) { | |
3594 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); | |
3595 } | |
3596 (*list)[start_pos] = insert; | |
3597 return count + 1; | |
3598 } | |
3599 if (start_pos + 1 == end_pos) { | |
3600 // Replace single existing range at position start_pos. | |
3601 CharacterRange to_replace = list->At(start_pos); | |
3602 intptr_t new_from = Utils::Minimum(to_replace.from(), from); | |
3603 intptr_t new_to = Utils::Maximum(to_replace.to(), to); | |
3604 (*list)[start_pos] = CharacterRange(new_from, new_to); | |
3605 return count; | |
3606 } | |
3607 // Replace a number of existing ranges from start_pos to end_pos - 1. | |
3608 // Move the remaining ranges down. | |
3609 | |
3610 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from); | |
3611 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to); | |
3612 if (end_pos < count) { | |
3613 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); | |
3614 } | |
3615 (*list)[start_pos] = CharacterRange(new_from, new_to); | |
3616 return count - (end_pos - start_pos) + 1; | |
3617 } | |
3618 | |
3619 | |
3620 bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) { | |
3621 ASSERT(ranges != NULL); | |
3622 intptr_t n = ranges->length(); | |
3623 if (n <= 1) return true; | |
3624 intptr_t max = ranges->At(0).to(); | |
3625 for (intptr_t i = 1; i < n; i++) { | |
3626 CharacterRange next_range = ranges->At(i); | |
3627 if (next_range.from() <= max + 1) return false; | |
3628 max = next_range.to(); | |
3629 } | |
3630 return true; | |
3631 } | |
3632 | |
3633 | |
3634 void CharacterRange::Canonicalize( | |
3635 ZoneGrowableArray<CharacterRange>* character_ranges) { | |
3636 if (character_ranges->length() <= 1) return; | |
3637 // Check whether ranges are already canonical (increasing, non-overlapping, | |
3638 // non-adjacent). | |
3639 intptr_t n = character_ranges->length(); | |
3640 intptr_t max = character_ranges->At(0).to(); | |
3641 intptr_t i = 1; | |
3642 while (i < n) { | |
3643 CharacterRange current = character_ranges->At(i); | |
3644 if (current.from() <= max + 1) { | |
3645 break; | |
3646 } | |
3647 max = current.to(); | |
3648 i++; | |
3649 } | |
3650 // Canonical until the i'th range. If that's all of them, we are done. | |
3651 if (i == n) return; | |
3652 | |
3653 // The ranges at index i and forward are not canonicalized. Make them so by | |
3654 // doing the equivalent of insertion sort (inserting each into the previous | |
3655 // list, in order). | |
3656 // Notice that inserting a range can reduce the number of ranges in the | |
3657 // result due to combining of adjacent and overlapping ranges. | |
3658 intptr_t read = i; // Range to insert. | |
3659 intptr_t num_canonical = i; // Length of canonicalized part of list. | |
3660 do { | |
3661 num_canonical = InsertRangeInCanonicalList(character_ranges, | |
3662 num_canonical, | |
3663 character_ranges->At(read)); | |
3664 read++; | |
3665 } while (read < n); | |
3666 character_ranges->TruncateTo(num_canonical); | |
3667 | |
3668 ASSERT(CharacterRange::IsCanonical(character_ranges)); | |
3669 } | |
3670 | |
3671 | |
3672 void CharacterSet::Canonicalize() { | |
3673 // Special/default classes are always considered canonical. The result | |
3674 // of calling ranges() will be sorted. | |
3675 if (ranges_ == NULL) return; | |
3676 CharacterRange::Canonicalize(ranges_); | |
3677 } | |
3678 | |
3679 | |
3680 ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() { | |
3681 if (ranges_ == NULL) { | |
3682 ranges_ = new ZoneGrowableArray<CharacterRange>(2); | |
3683 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | |
3684 } | |
3685 return ranges_; | |
3686 } | |
3687 | |
3688 | |
3689 void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges, | |
3690 ZoneGrowableArray<CharacterRange>* negated_ranges) { | |
3691 ASSERT(CharacterRange::IsCanonical(ranges)); | |
3692 ASSERT(negated_ranges->length() == 0); | |
3693 intptr_t range_count = ranges->length(); | |
3694 uint16_t from = 0; | |
3695 intptr_t i = 0; | |
3696 if (range_count > 0 && ranges->At(0).from() == 0) { | |
3697 from = ranges->At(0).to(); | |
3698 i = 1; | |
3699 } | |
3700 while (i < range_count) { | |
3701 CharacterRange range = ranges->At(i); | |
3702 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1)); | |
3703 from = range.to(); | |
3704 i++; | |
3705 } | |
3706 if (from < Utf16::kMaxCodeUnit) { | |
3707 negated_ranges->Add(CharacterRange(from + 1, Utf16::kMaxCodeUnit)); | |
3708 } | |
3709 } | |
3710 | |
3711 | |
3712 void CharacterRange::AddCaseEquivalents( | |
3713 ZoneGrowableArray<CharacterRange>* ranges, | |
3714 bool is_ascii, | |
3715 Isolate* isolate) { | |
3716 uint16_t bottom = from(); | |
3717 uint16_t top = to(); | |
3718 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { | |
3719 if (bottom > Symbols::kMaxOneCharCodeSymbol) return; | |
3720 if (top > Symbols::kMaxOneCharCodeSymbol) { | |
3721 top = Symbols::kMaxOneCharCodeSymbol; | |
3722 } | |
3723 } | |
3724 | |
3725 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize; | |
3726 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange; | |
3727 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
3728 if (top == bottom) { | |
3729 // If this is a singleton we just expand the one character. | |
3730 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLIN T | |
3731 for (intptr_t i = 0; i < length; i++) { | |
3732 uint32_t chr = chars[i]; | |
3733 if (chr != bottom) { | |
3734 ranges->Add(CharacterRange::Singleton(chars[i])); | |
3735 } | |
3736 } | |
3737 } else { | |
3738 // If this is a range we expand the characters block by block, | |
3739 // expanding contiguous subranges (blocks) one at a time. | |
3740 // The approach is as follows. For a given start character we | |
3741 // look up the remainder of the block that contains it (represented | |
3742 // by the end point), for instance we find 'z' if the character | |
3743 // is 'c'. A block is characterized by the property | |
3744 // that all characters uncanonicalize in the same way, except that | |
3745 // each entry in the result is incremented by the distance from the first | |
3746 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and // NOLINT | |
3747 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. | |
3748 // Once we've found the end point we look up its uncanonicalization | |
3749 // and produce a range for each element. For instance for [c-f] | |
3750 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only | |
3751 // add a range if it is not already contained in the input, so [c-f] | |
3752 // will be skipped but [C-F] will be added. If this range is not | |
3753 // completely contained in a block we do this for all the blocks | |
3754 // covered by the range (handling characters that is not in a block | |
3755 // as a "singleton block"). | |
3756 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
3757 intptr_t pos = bottom; | |
3758 while (pos <= top) { | |
3759 intptr_t length = jsregexp_canonrange.get(pos, '\0', range); | |
3760 uint16_t block_end; | |
3761 if (length == 0) { | |
3762 block_end = pos; | |
3763 } else { | |
3764 ASSERT(length == 1); | |
3765 block_end = range[0]; | |
3766 } | |
3767 intptr_t end = (block_end > top) ? top : block_end; | |
3768 length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT | |
3769 for (intptr_t i = 0; i < length; i++) { | |
3770 uint32_t c = range[i]; | |
3771 uint16_t range_from = c - (block_end - pos); | |
3772 uint16_t range_to = c - (block_end - end); | |
3773 if (!(bottom <= range_from && range_to <= top)) { | |
3774 ranges->Add(CharacterRange(range_from, range_to)); | |
3775 } | |
3776 } | |
3777 pos = end + 1; | |
3778 } | |
3779 } | |
3780 } | |
3781 | |
3782 | |
3783 // ------------------------------------------------------------------- | |
3784 // Splay tree | |
3785 | |
3786 | |
3787 // Workaround for the fact that ZoneGrowableArray does not have contains(). | |
3788 static bool ArrayContains(ZoneGrowableArray<unsigned>* array, | |
3789 unsigned value) { | |
3790 for (intptr_t i = 0; i < array->length(); i++) { | |
3791 if (array->At(i) == value) { | |
3792 return true; | |
3793 } | |
3794 } | |
3795 return false; | |
3796 } | |
3797 | |
3798 | |
3799 void OutSet::Set(unsigned value, Isolate* isolate) { | |
3800 if (value < kFirstLimit) { | |
3801 first_ |= (1 << value); | |
3802 } else { | |
3803 if (remaining_ == NULL) | |
3804 remaining_ = new(isolate) ZoneGrowableArray<unsigned>(1); | |
3805 | |
3806 bool remaining_contains_value = ArrayContains(remaining_, value); | |
3807 if (remaining_->is_empty() || !remaining_contains_value) { | |
3808 remaining_->Add(value); | |
3809 } | |
3810 } | |
3811 } | |
3812 | |
3813 | |
3814 bool OutSet::Get(unsigned value) const { | |
3815 if (value < kFirstLimit) { | |
3816 return (first_ & (1 << value)) != 0; | |
3817 } else if (remaining_ == NULL) { | |
3818 return false; | |
3819 } else { | |
3820 return ArrayContains(remaining_, value); | |
3821 } | |
3822 } | |
3823 | |
3824 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, | |
3825 RegExpCompiler* compiler, | |
3826 intptr_t filled_in, | |
3827 bool not_at_start) { | |
3828 if (assertion_type_ == AT_START && not_at_start) { | |
3829 details->set_cannot_match(); | |
3830 return; | |
3831 } | |
3832 return on_success()->GetQuickCheckDetails(details, | |
3833 compiler, | |
3834 filled_in, | |
3835 not_at_start); | |
3836 } | |
3837 | |
3838 | |
3839 void GuardedAlternative::AddGuard(Guard* guard, Isolate* isolate) { | |
3840 if (guards_ == NULL) | |
3841 guards_ = new(isolate) ZoneGrowableArray<Guard*>(1); | |
3842 guards_->Add(guard); | |
3843 } | |
3844 | |
3845 | |
3846 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { | |
3847 ASSERT(loop_node_ == NULL); | |
3848 AddAlternative(alt); | |
3849 loop_node_ = alt.node(); | |
3850 } | |
3851 | |
3852 | |
3853 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { | |
3854 ASSERT(continue_node_ == NULL); | |
3855 AddAlternative(alt); | |
3856 continue_node_ = alt.node(); | |
3857 } | |
3858 | |
3859 | |
3860 void LoopChoiceNode::Accept(NodeVisitor* visitor) { | |
3861 visitor->VisitLoopChoice(this); | |
3862 } | |
3863 | |
3864 | |
3865 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find, | |
3866 intptr_t budget, | |
3867 bool not_at_start) { | |
3868 return EatsAtLeastHelper(still_to_find, | |
3869 budget - 1, | |
3870 loop_node_, | |
3871 not_at_start); | |
3872 } | |
3873 | |
3874 | |
3875 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | |
3876 RegExpCompiler* compiler, | |
3877 intptr_t characters_filled_in, | |
3878 bool not_at_start) { | |
3879 not_at_start = (not_at_start || not_at_start_); | |
3880 intptr_t choice_count = alternatives_->length(); | |
3881 ASSERT(choice_count > 0); | |
3882 (*alternatives_)[0].node()->GetQuickCheckDetails(details, | |
3883 compiler, | |
3884 characters_filled_in, | |
3885 not_at_start); | |
3886 for (intptr_t i = 1; i < choice_count; i++) { | |
3887 QuickCheckDetails new_details(details->characters()); | |
3888 RegExpNode* node = (*alternatives_)[i].node(); | |
3889 node->GetQuickCheckDetails(&new_details, compiler, | |
3890 characters_filled_in, | |
3891 not_at_start); | |
3892 // Here we merge the quick match details of the two branches. | |
3893 details->Merge(&new_details, characters_filled_in); | |
3894 } | |
3895 } | |
3896 | |
3897 | |
3898 void QuickCheckDetails::Clear() { | |
3899 for (intptr_t i = 0; i < characters_; i++) { | |
3900 positions_[i].mask = 0; | |
3901 positions_[i].value = 0; | |
3902 positions_[i].determines_perfectly = false; | |
3903 } | |
3904 characters_ = 0; | |
3905 } | |
3906 | |
3907 | |
3908 void QuickCheckDetails::Advance(intptr_t by, bool ascii) { | |
3909 ASSERT(by >= 0); | |
3910 if (by >= characters_) { | |
3911 Clear(); | |
3912 return; | |
3913 } | |
3914 for (intptr_t i = 0; i < characters_ - by; i++) { | |
3915 positions_[i] = positions_[by + i]; | |
3916 } | |
3917 for (intptr_t i = characters_ - by; i < characters_; i++) { | |
3918 positions_[i].mask = 0; | |
3919 positions_[i].value = 0; | |
3920 positions_[i].determines_perfectly = false; | |
3921 } | |
3922 characters_ -= by; | |
3923 // We could change mask_ and value_ here but we would never advance unless | |
3924 // they had already been used in a check and they won't be used again because | |
3925 // it would gain us nothing. So there's no point. | |
3926 } | |
3927 | |
3928 | |
3929 void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) { | |
3930 ASSERT(characters_ == other->characters_); | |
3931 if (other->cannot_match_) { | |
3932 return; | |
3933 } | |
3934 if (cannot_match_) { | |
3935 *this = *other; | |
3936 return; | |
3937 } | |
3938 for (intptr_t i = from_index; i < characters_; i++) { | |
3939 QuickCheckDetails::Position* pos = positions(i); | |
3940 QuickCheckDetails::Position* other_pos = other->positions(i); | |
3941 if (pos->mask != other_pos->mask || | |
3942 pos->value != other_pos->value || | |
3943 !other_pos->determines_perfectly) { | |
3944 // Our mask-compare operation will be approximate unless we have the | |
3945 // exact same operation on both sides of the alternation. | |
3946 pos->determines_perfectly = false; | |
3947 } | |
3948 pos->mask &= other_pos->mask; | |
3949 pos->value &= pos->mask; | |
3950 other_pos->value &= pos->mask; | |
3951 uint16_t differing_bits = (pos->value ^ other_pos->value); | |
3952 pos->mask &= ~differing_bits; | |
3953 pos->value &= pos->mask; | |
3954 } | |
3955 } | |
3956 | |
3957 | |
3958 // Finds the fixed match length of a sequence of nodes that goes from | |
3959 // this alternative and back to this choice node. If there are variable | |
3960 // length nodes or other complications in the way then return a sentinel | |
3961 // value indicating that a greedy loop cannot be constructed. | |
3962 intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative( | |
3963 GuardedAlternative* alternative) { | |
3964 intptr_t length = 0; | |
3965 RegExpNode* node = alternative->node(); | |
3966 // Later we will generate code for all these text nodes using recursion | |
3967 // so we have to limit the max number. | |
3968 intptr_t recursion_depth = 0; | |
3969 while (node != this) { | |
3970 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { | |
3971 return kNodeIsTooComplexForGreedyLoops; | |
3972 } | |
3973 intptr_t node_length = node->GreedyLoopTextLength(); | |
3974 if (node_length == kNodeIsTooComplexForGreedyLoops) { | |
3975 return kNodeIsTooComplexForGreedyLoops; | |
3976 } | |
3977 length += node_length; | |
3978 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); | |
3979 node = seq_node->on_success(); | |
3980 } | |
3981 return length; | |
3982 } | |
3983 | |
3984 | |
3985 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( | |
3986 QuickCheckDetails* details, | |
3987 RegExpCompiler* compiler, | |
3988 intptr_t filled_in, | |
3989 bool not_at_start) { | |
3990 // Alternative 0 is the negative lookahead, alternative 1 is what comes | |
3991 // afterwards. | |
3992 RegExpNode* node = (*alternatives_)[1].node(); | |
3993 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); | |
3994 } | |
3995 | |
3996 | |
3997 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { | |
3998 return RegExpEngine::CompilationResult("RegExp too big"); | |
3999 } | |
4000 | |
4001 | |
4002 // Attempts to compile the regexp using an Irregexp code generator. Returns | |
4003 // a fixed array or a null handle depending on whether it succeeded. | |
4004 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case, | |
4005 intptr_t specialization_cid) | |
4006 : next_register_(2 * (capture_count + 1)), | |
4007 work_list_(NULL), | |
4008 recursion_depth_(0), | |
4009 ignore_case_(ignore_case), | |
4010 specialization_cid_(specialization_cid), | |
4011 reg_exp_too_big_(false), | |
4012 current_expansion_factor_(1), | |
4013 isolate_(Isolate::Current()) { | |
4014 accept_ = new(I) EndNode(EndNode::ACCEPT, I); | |
4015 } | |
4016 | |
4017 | |
4018 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | |
4019 IRRegExpMacroAssembler* macro_assembler, | |
4020 RegExpNode* start, | |
4021 intptr_t capture_count, | |
4022 const String& pattern) { | |
4023 static const bool use_slow_safe_regexp_compiler = false; | |
4024 | |
4025 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); | |
4026 macro_assembler_ = macro_assembler; | |
4027 | |
4028 ZoneGrowableArray<RegExpNode*> work_list(0); | |
4029 work_list_ = &work_list; | |
4030 BlockLabel fail; | |
4031 macro_assembler_->PushBacktrack(&fail); | |
4032 Trace new_trace; | |
4033 start->Emit(this, &new_trace); | |
4034 macro_assembler_->BindBlock(&fail); | |
4035 macro_assembler_->Fail(); | |
4036 while (!work_list.is_empty()) { | |
4037 work_list.RemoveLast()->Emit(this, &new_trace); | |
4038 } | |
4039 if (reg_exp_too_big_) return IrregexpRegExpTooBig(); | |
4040 | |
4041 macro_assembler->FinalizeIndirectGotos(); | |
4042 | |
4043 return RegExpEngine::CompilationResult(macro_assembler, | |
4044 macro_assembler->graph_entry(), | |
4045 macro_assembler->num_blocks(), | |
4046 macro_assembler->num_stack_locals()); | |
4047 } | |
4048 | |
4049 | |
4050 RegExpEngine::CompilationResult RegExpEngine::Compile( | |
4051 RegExpCompileData* data, | |
4052 const ParsedFunction* parsed_function, | |
4053 const ZoneGrowableArray<const ICData*>& ic_data_array) { | |
4054 Isolate* isolate = Isolate::Current(); | |
4055 | |
4056 const Function& function = parsed_function->function(); | |
4057 const intptr_t specialization_cid = function.regexp_cid(); | |
4058 const bool is_ascii = (specialization_cid == kOneByteStringCid || | |
4059 specialization_cid == kExternalOneByteStringCid); | |
4060 JSRegExp& regexp = JSRegExp::Handle(isolate, function.regexp()); | |
4061 const String& pattern = String::Handle(isolate, regexp.pattern()); | |
4062 | |
4063 ASSERT(!regexp.IsNull()); | |
4064 ASSERT(!pattern.IsNull()); | |
4065 | |
4066 const bool ignore_case = regexp.is_ignore_case(); | |
4067 const bool is_global = regexp.is_global(); | |
4068 | |
4069 RegExpCompiler compiler(data->capture_count, ignore_case, specialization_cid); | |
4070 | |
4071 // TODO(jgruber): Frequency sampling is currently disabled because of several | |
4072 // issues. We do not want to store subject strings in the regexp object since | |
4073 // they might be long and we should not prevent their garbage collection. | |
4074 // Passing them to this function explicitly does not help, since we must | |
4075 // generate exactly the same IR for both the unoptimizing and optimizing | |
4076 // pipelines (otherwise it gets confused when i.e. deopt id's differ). | |
4077 // An option would be to store sampling results in the regexp object, but | |
4078 // I'm not sure the performance gains are relevant enough. | |
4079 | |
4080 // Wrap the body of the regexp in capture #0. | |
4081 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, | |
4082 0, | |
4083 &compiler, | |
4084 compiler.accept()); | |
4085 | |
4086 RegExpNode* node = captured_body; | |
4087 bool is_start_anchored = data->tree->IsAnchoredAtStart(); | |
4088 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); | |
4089 intptr_t max_length = data->tree->max_match(); | |
4090 if (!is_start_anchored) { | |
4091 // Add a .*? at the beginning, outside the body capture, unless | |
4092 // this expression is anchored at the beginning. | |
4093 RegExpNode* loop_node = | |
4094 RegExpQuantifier::ToNode(0, | |
4095 RegExpTree::kInfinity, | |
4096 false, | |
4097 new(isolate) RegExpCharacterClass('*'), | |
4098 &compiler, | |
4099 captured_body, | |
4100 data->contains_anchor); | |
4101 | |
4102 if (data->contains_anchor) { | |
4103 // Unroll loop once, to take care of the case that might start | |
4104 // at the start of input. | |
4105 ChoiceNode* first_step_node = new(isolate) ChoiceNode(2, isolate); | |
4106 first_step_node->AddAlternative(GuardedAlternative(captured_body)); | |
4107 first_step_node->AddAlternative(GuardedAlternative( | |
4108 new(isolate) TextNode( | |
4109 new(isolate) RegExpCharacterClass('*'), loop_node))); | |
4110 node = first_step_node; | |
4111 } else { | |
4112 node = loop_node; | |
4113 } | |
4114 } | |
4115 if (is_ascii) { | |
4116 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); | |
4117 // Do it again to propagate the new nodes to places where they were not | |
4118 // put because they had not been calculated yet. | |
4119 if (node != NULL) { | |
4120 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); | |
4121 } | |
4122 } | |
4123 | |
4124 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate); | |
4125 data->node = node; | |
4126 Analysis analysis(ignore_case, is_ascii); | |
4127 analysis.EnsureAnalyzed(node); | |
4128 if (analysis.has_failed()) { | |
4129 const char* error_message = analysis.error_message(); | |
4130 return CompilationResult(error_message); | |
4131 } | |
4132 | |
4133 // Native regexp implementation. | |
4134 | |
4135 IRRegExpMacroAssembler* macro_assembler = | |
4136 new(isolate) IRRegExpMacroAssembler(specialization_cid, | |
4137 data->capture_count, | |
4138 parsed_function, | |
4139 ic_data_array, | |
4140 isolate); | |
4141 | |
4142 // Inserted here, instead of in Assembler, because it depends on information | |
4143 // in the AST that isn't replicated in the Node structure. | |
4144 static const intptr_t kMaxBacksearchLimit = 1024; | |
4145 if (is_end_anchored && | |
4146 !is_start_anchored && | |
4147 max_length < kMaxBacksearchLimit) { | |
4148 macro_assembler->SetCurrentPositionFromEnd(max_length); | |
4149 } | |
4150 | |
4151 if (is_global) { | |
4152 macro_assembler->set_global_mode( | |
4153 (data->tree->min_match() > 0) | |
4154 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK | |
4155 : RegExpMacroAssembler::GLOBAL); | |
4156 } | |
4157 | |
4158 RegExpEngine::CompilationResult result = | |
4159 compiler.Assemble(macro_assembler, | |
4160 node, | |
4161 data->capture_count, | |
4162 pattern); | |
4163 | |
4164 if (FLAG_trace_irregexp) { | |
4165 macro_assembler->PrintBlocks(); | |
4166 } | |
4167 | |
4168 return result; | |
4169 } | |
4170 | |
4171 | |
4172 static void CreateSpecializedFunction(Isolate* isolate, | |
4173 JSRegExp* regexp, | |
Ivan Posva
2014/10/10 06:35:54
const JSRegExp&
jgruber1
2014/10/10 09:29:44
Done.
| |
4174 intptr_t specialization_cid, | |
4175 const Object& owner) { | |
4176 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount; | |
4177 | |
4178 Function& fn = Function::Handle(isolate, | |
4179 Function::New(String::Handle(isolate, Symbols::New("RegExp")), | |
4180 RawFunction::kIrregexpFunction, | |
4181 true, // Static. | |
4182 false, // Not const. | |
4183 false, // Not abstract. | |
4184 false, // Not external. | |
4185 false, // Not native. | |
4186 owner, | |
4187 0)); // Requires a non-negative token position. | |
4188 | |
4189 fn.set_num_fixed_parameters(kParamCount); | |
4190 fn.set_parameter_types( | |
Ivan Posva
2014/10/10 06:35:54
These parameter arrays should be allocated only on
jgruber1
2014/10/10 09:29:44
Where should they be cached? FYI, CreateMethodExtr
Vyacheslav Egorov (Google)
2014/10/10 09:43:25
For starters they could be cached outside of this
| |
4191 Array::Handle(isolate, Array::New(kParamCount, Heap::kOld))); | |
4192 fn.set_parameter_names( | |
4193 Array::Handle(isolate, Array::New(kParamCount, Heap::kOld))); | |
4194 fn.SetParameterTypeAt(0, Type::Handle(isolate, Type::DynamicType())); | |
4195 fn.SetParameterNameAt(0, Symbols::string_param_()); | |
4196 fn.SetParameterTypeAt(1, Type::Handle(isolate, Type::DynamicType())); | |
4197 fn.SetParameterNameAt(1, Symbols::start_index_param_()); | |
4198 fn.set_result_type(Type::Handle(isolate, Type::ArrayType())); | |
4199 | |
4200 // Cache the result. | |
4201 regexp->set_function(specialization_cid, fn); | |
4202 | |
4203 fn.set_regexp(*regexp); | |
4204 fn.set_regexp_cid(specialization_cid); | |
4205 | |
4206 // The function is compiled lazily during the first call. | |
4207 } | |
4208 | |
4209 | |
4210 RawJSRegExp* RegExpEngine::CreateJSRegExp(Isolate* isolate, | |
4211 const String& pattern, | |
4212 bool multi_line, | |
4213 bool ignore_case) { | |
4214 JSRegExp& regexp = JSRegExp::Handle(JSRegExp::New(0)); | |
4215 | |
4216 regexp.set_pattern(pattern); | |
4217 | |
4218 if (multi_line) regexp.set_is_multi_line(); | |
Ivan Posva
2014/10/10 06:35:54
if (cond) {
statements;
}
jgruber1
2014/10/10 09:29:44
Done.
| |
4219 if (ignore_case) regexp.set_is_ignore_case(); | |
4220 | |
4221 // TODO(jgruber): We might want to use normal string searching algorithms | |
4222 // for simple patterns. | |
4223 regexp.set_is_complex(); | |
4224 regexp.set_is_global(); // All dart regexps are global. | |
4225 | |
4226 const Library& lib = Library::Handle(isolate, Library::CoreLibrary()); | |
4227 const Class& owner = Class::Handle(isolate, | |
4228 lib.LookupClass(String::Handle(isolate, Symbols::New("RegExp")))); | |
4229 | |
4230 CreateSpecializedFunction(isolate, ®exp, kOneByteStringCid, owner); | |
4231 CreateSpecializedFunction(isolate, ®exp, kTwoByteStringCid, owner); | |
4232 CreateSpecializedFunction(isolate, ®exp, kExternalOneByteStringCid, owner); | |
4233 CreateSpecializedFunction(isolate, ®exp, kExternalTwoByteStringCid, owner); | |
4234 | |
4235 return regexp.raw(); | |
4236 } | |
4237 | |
4238 | |
4239 void BoyerMoorePositionInfo::Set(intptr_t character) { | |
4240 SetInterval(Interval(character, character)); | |
4241 } | |
4242 | |
4243 | |
4244 ContainedInLattice AddRange(ContainedInLattice containment, | |
4245 const intptr_t* ranges, | |
4246 intptr_t ranges_length, | |
4247 Interval new_range) { | |
4248 ASSERT((ranges_length & 1) == 1); | |
4249 ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1); | |
4250 if (containment == kLatticeUnknown) return containment; | |
4251 bool inside = false; | |
4252 intptr_t last = 0; | |
4253 for (intptr_t i = 0; i < ranges_length; | |
4254 inside = !inside, last = ranges[i], i++) { | |
4255 // Consider the range from last to ranges[i]. | |
4256 // We haven't got to the new range yet. | |
4257 if (ranges[i] <= new_range.from()) continue; | |
4258 // New range is wholly inside last-ranges[i]. Note that new_range.to() is | |
4259 // inclusive, but the values in ranges are not. | |
4260 if (last <= new_range.from() && new_range.to() < ranges[i]) { | |
4261 return Combine(containment, inside ? kLatticeIn : kLatticeOut); | |
4262 } | |
4263 return kLatticeUnknown; | |
4264 } | |
4265 return containment; | |
4266 } | |
4267 | |
4268 | |
4269 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { | |
4270 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); | |
4271 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); | |
4272 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); | |
4273 surrogate_ = | |
4274 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); | |
4275 if (interval.to() - interval.from() >= kMapSize - 1) { | |
4276 if (map_count_ != kMapSize) { | |
4277 map_count_ = kMapSize; | |
4278 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true; | |
4279 } | |
4280 return; | |
4281 } | |
4282 for (intptr_t i = interval.from(); i <= interval.to(); i++) { | |
4283 intptr_t mod_character = (i & kMask); | |
4284 if (!map_->At(mod_character)) { | |
4285 map_count_++; | |
4286 (*map_)[mod_character] = true; | |
4287 } | |
4288 if (map_count_ == kMapSize) return; | |
4289 } | |
4290 } | |
4291 | |
4292 | |
4293 void BoyerMoorePositionInfo::SetAll() { | |
4294 s_ = w_ = d_ = kLatticeUnknown; | |
4295 if (map_count_ != kMapSize) { | |
4296 map_count_ = kMapSize; | |
4297 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true; | |
4298 } | |
4299 } | |
4300 | |
4301 | |
4302 BoyerMooreLookahead::BoyerMooreLookahead( | |
4303 intptr_t length, RegExpCompiler* compiler, Isolate* isolate) | |
4304 : length_(length), | |
4305 compiler_(compiler) { | |
4306 if (compiler->ascii()) { | |
4307 max_char_ = Symbols::kMaxOneCharCodeSymbol; | |
4308 } else { | |
4309 max_char_ = Utf16::kMaxCodeUnit; | |
4310 } | |
4311 bitmaps_ = new(isolate) ZoneGrowableArray<BoyerMoorePositionInfo*>(length); | |
4312 for (intptr_t i = 0; i < length; i++) { | |
4313 bitmaps_->Add(new(isolate) BoyerMoorePositionInfo(isolate)); | |
4314 } | |
4315 } | |
4316 | |
4317 | |
4318 // Take all the characters that will not prevent a successful match if they | |
4319 // occur in the subject string in the range between min_lookahead and | |
4320 // max_lookahead (inclusive) measured from the current position. If the | |
4321 // character at max_lookahead offset is not one of these characters, then we | |
4322 // can safely skip forwards by the number of characters in the range. | |
4323 intptr_t BoyerMooreLookahead::GetSkipTable( | |
4324 intptr_t min_lookahead, | |
4325 intptr_t max_lookahead, | |
4326 const TypedData& boolean_skip_table) { | |
4327 const intptr_t kSize = RegExpMacroAssembler::kTableSize; | |
4328 | |
4329 const intptr_t kSkipArrayEntry = 0; | |
4330 const intptr_t kDontSkipArrayEntry = 1; | |
4331 | |
4332 for (intptr_t i = 0; i < kSize; i++) { | |
4333 boolean_skip_table.SetUint8(i, kSkipArrayEntry); | |
4334 } | |
4335 intptr_t skip = max_lookahead + 1 - min_lookahead; | |
4336 | |
4337 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) { | |
4338 BoyerMoorePositionInfo* map = bitmaps_->At(i); | |
4339 for (intptr_t j = 0; j < kSize; j++) { | |
4340 if (map->at(j)) { | |
4341 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry); | |
4342 } | |
4343 } | |
4344 } | |
4345 | |
4346 return skip; | |
4347 } | |
4348 | |
4349 | |
4350 // Find the longest range of lookahead that has the fewest number of different | |
4351 // characters that can occur at a given position. Since we are optimizing two | |
4352 // different parameters at once this is a tradeoff. | |
4353 bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) { | |
4354 intptr_t biggest_points = 0; | |
4355 // If more than 32 characters out of 128 can occur it is unlikely that we can | |
4356 // be lucky enough to step forwards much of the time. | |
4357 const intptr_t kMaxMax = 32; | |
4358 for (intptr_t max_number_of_chars = 4; | |
4359 max_number_of_chars < kMaxMax; | |
4360 max_number_of_chars *= 2) { | |
4361 biggest_points = | |
4362 FindBestInterval(max_number_of_chars, biggest_points, from, to); | |
4363 } | |
4364 if (biggest_points == 0) return false; | |
4365 return true; | |
4366 } | |
4367 | |
4368 | |
4369 // Find the highest-points range between 0 and length_ where the character | |
4370 // information is not too vague. 'Too vague' means that there are more than | |
4371 // max_number_of_chars that can occur at this position. Calculates the number | |
4372 // of points as the product of width-of-the-range and | |
4373 // probability-of-finding-one-of-the-characters, where the probability is | |
4374 // calculated using the frequency distribution of the sample subject string. | |
4375 intptr_t BoyerMooreLookahead::FindBestInterval( | |
4376 intptr_t max_number_of_chars, | |
4377 intptr_t old_biggest_points, | |
4378 intptr_t* from, | |
4379 intptr_t* to) { | |
4380 intptr_t biggest_points = old_biggest_points; | |
4381 static const intptr_t kSize = RegExpMacroAssembler::kTableSize; | |
4382 for (intptr_t i = 0; i < length_; ) { | |
4383 while (i < length_ && Count(i) > max_number_of_chars) i++; | |
4384 if (i == length_) break; | |
4385 intptr_t remembered_from = i; | |
4386 bool union_map[kSize]; | |
4387 for (intptr_t j = 0; j < kSize; j++) union_map[j] = false; | |
4388 while (i < length_ && Count(i) <= max_number_of_chars) { | |
4389 BoyerMoorePositionInfo* map = bitmaps_->At(i); | |
4390 for (intptr_t j = 0; j < kSize; j++) union_map[j] |= map->at(j); | |
4391 i++; | |
4392 } | |
4393 intptr_t frequency = 0; | |
4394 for (intptr_t j = 0; j < kSize; j++) { | |
4395 if (union_map[j]) { | |
4396 // Add 1 to the frequency to give a small per-character boost for | |
4397 // the cases where our sampling is not good enough and many | |
4398 // characters have a frequency of zero. This means the frequency | |
4399 // can theoretically be up to 2*kSize though we treat it mostly as | |
4400 // a fraction of kSize. | |
4401 frequency += compiler_->frequency_collator()->Frequency(j) + 1; | |
4402 } | |
4403 } | |
4404 // We use the probability of skipping times the distance we are skipping to | |
4405 // judge the effectiveness of this. Actually we have a cut-off: By | |
4406 // dividing by 2 we switch off the skipping if the probability of skipping | |
4407 // is less than 50%. This is because the multibyte mask-and-compare | |
4408 // skipping in quickcheck is more likely to do well on this case. | |
4409 bool in_quickcheck_range = ((i - remembered_from < 4) || | |
4410 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); | |
4411 // Called 'probability' but it is only a rough estimate and can actually | |
4412 // be outside the 0-kSize range. | |
4413 intptr_t probability = | |
4414 (in_quickcheck_range ? kSize / 2 : kSize) - frequency; | |
4415 intptr_t points = (i - remembered_from) * probability; | |
4416 if (points > biggest_points) { | |
4417 *from = remembered_from; | |
4418 *to = i - 1; | |
4419 biggest_points = points; | |
4420 } | |
4421 } | |
4422 return biggest_points; | |
4423 } | |
4424 | |
4425 | |
4426 // See comment above on the implementation of GetSkipTable. | |
4427 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | |
4428 const intptr_t kSize = RegExpMacroAssembler::kTableSize; | |
4429 | |
4430 intptr_t min_lookahead = 0; | |
4431 intptr_t max_lookahead = 0; | |
4432 | |
4433 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | |
4434 | |
4435 bool found_single_character = false; | |
4436 intptr_t single_character = 0; | |
4437 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) { | |
4438 BoyerMoorePositionInfo* map = bitmaps_->At(i); | |
4439 if (map->map_count() > 1 || | |
4440 (found_single_character && map->map_count() != 0)) { | |
4441 found_single_character = false; | |
4442 break; | |
4443 } | |
4444 for (intptr_t j = 0; j < kSize; j++) { | |
4445 if (map->at(j)) { | |
4446 found_single_character = true; | |
4447 single_character = j; | |
4448 break; | |
4449 } | |
4450 } | |
4451 } | |
4452 | |
4453 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead; | |
4454 | |
4455 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | |
4456 // The mask-compare can probably handle this better. | |
4457 return false; | |
4458 } | |
4459 | |
4460 if (found_single_character) { | |
4461 BlockLabel cont, again; | |
4462 masm->BindBlock(&again); | |
4463 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
4464 if (max_char_ > kSize) { | |
4465 masm->CheckCharacterAfterAnd(single_character, | |
4466 RegExpMacroAssembler::kTableMask, | |
4467 &cont); | |
4468 } else { | |
4469 masm->CheckCharacter(single_character, &cont); | |
4470 } | |
4471 masm->AdvanceCurrentPosition(lookahead_width); | |
4472 masm->GoTo(&again); | |
4473 masm->BindBlock(&cont); | |
4474 return true; | |
4475 } | |
4476 | |
4477 const TypedData& boolean_skip_table = TypedData::ZoneHandle( | |
4478 compiler_->isolate(), | |
4479 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld)); | |
4480 intptr_t skip_distance = GetSkipTable( | |
4481 min_lookahead, max_lookahead, boolean_skip_table); | |
4482 ASSERT(skip_distance != 0); | |
4483 | |
4484 BlockLabel cont, again; | |
4485 | |
4486 masm->BindBlock(&again); | |
4487 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
4488 masm->CheckBitInTable(boolean_skip_table, &cont); | |
4489 masm->AdvanceCurrentPosition(skip_distance); | |
4490 masm->GoTo(&again); | |
4491 masm->BindBlock(&cont); | |
4492 | |
4493 return true; | |
4494 } | |
4495 | |
4496 | |
4497 // ------------------------------------------------------------------- | |
4498 // Analysis | |
4499 | |
4500 | |
4501 void Analysis::EnsureAnalyzed(RegExpNode* that) { | |
4502 if (that->info()->been_analyzed || that->info()->being_analyzed) | |
4503 return; | |
4504 that->info()->being_analyzed = true; | |
4505 that->Accept(this); | |
4506 that->info()->being_analyzed = false; | |
4507 that->info()->been_analyzed = true; | |
4508 } | |
4509 | |
4510 | |
4511 void Analysis::VisitEnd(EndNode* that) { | |
4512 // nothing to do | |
4513 } | |
4514 | |
4515 | |
4516 void TextNode::CalculateOffsets() { | |
4517 intptr_t element_count = elements()->length(); | |
4518 // Set up the offsets of the elements relative to the start. This is a fixed | |
4519 // quantity since a TextNode can only contain fixed-width things. | |
4520 intptr_t cp_offset = 0; | |
4521 for (intptr_t i = 0; i < element_count; i++) { | |
4522 TextElement& elm = (*elements())[i]; | |
4523 elm.set_cp_offset(cp_offset); | |
4524 cp_offset += elm.length(); | |
4525 } | |
4526 } | |
4527 | |
4528 | |
4529 void Analysis::VisitText(TextNode* that) { | |
4530 if (ignore_case_) { | |
4531 that->MakeCaseIndependent(is_ascii_); | |
4532 } | |
4533 EnsureAnalyzed(that->on_success()); | |
4534 if (!has_failed()) { | |
4535 that->CalculateOffsets(); | |
4536 } | |
4537 } | |
4538 | |
4539 | |
4540 void Analysis::VisitAction(ActionNode* that) { | |
4541 RegExpNode* target = that->on_success(); | |
4542 EnsureAnalyzed(target); | |
4543 if (!has_failed()) { | |
4544 // If the next node is interested in what it follows then this node | |
4545 // has to be interested too so it can pass the information on. | |
4546 that->info()->AddFromFollowing(target->info()); | |
4547 } | |
4548 } | |
4549 | |
4550 | |
4551 void Analysis::VisitChoice(ChoiceNode* that) { | |
4552 NodeInfo* info = that->info(); | |
4553 for (intptr_t i = 0; i < that->alternatives()->length(); i++) { | |
4554 RegExpNode* node = (*that->alternatives())[i].node(); | |
4555 EnsureAnalyzed(node); | |
4556 if (has_failed()) return; | |
4557 // Anything the following nodes need to know has to be known by | |
4558 // this node also, so it can pass it on. | |
4559 info->AddFromFollowing(node->info()); | |
4560 } | |
4561 } | |
4562 | |
4563 | |
4564 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { | |
4565 NodeInfo* info = that->info(); | |
4566 for (intptr_t i = 0; i < that->alternatives()->length(); i++) { | |
4567 RegExpNode* node = (*that->alternatives())[i].node(); | |
4568 if (node != that->loop_node()) { | |
4569 EnsureAnalyzed(node); | |
4570 if (has_failed()) return; | |
4571 info->AddFromFollowing(node->info()); | |
4572 } | |
4573 } | |
4574 // Check the loop last since it may need the value of this node | |
4575 // to get a correct result. | |
4576 EnsureAnalyzed(that->loop_node()); | |
4577 if (!has_failed()) { | |
4578 info->AddFromFollowing(that->loop_node()->info()); | |
4579 } | |
4580 } | |
4581 | |
4582 | |
4583 void Analysis::VisitBackReference(BackReferenceNode* that) { | |
4584 EnsureAnalyzed(that->on_success()); | |
4585 } | |
4586 | |
4587 | |
4588 void Analysis::VisitAssertion(AssertionNode* that) { | |
4589 EnsureAnalyzed(that->on_success()); | |
4590 } | |
4591 | |
4592 | |
4593 // ------------------------------------------------------------------- | |
4594 // Dot/dotty output | |
4595 | |
4596 | |
4597 #ifdef DEBUG | |
4598 | |
4599 | |
4600 class DotPrinter: public NodeVisitor { | |
4601 public: | |
4602 explicit DotPrinter(bool ignore_case) | |
4603 : ignore_case_(ignore_case) {} | |
4604 void PrintNode(const char* label, RegExpNode* node); | |
4605 void Visit(RegExpNode* node); | |
4606 void PrintAttributes(RegExpNode* from); | |
4607 void PrintOnFailure(RegExpNode* from, RegExpNode* to); | |
4608 #define DECLARE_VISIT(Type) \ | |
4609 virtual void Visit##Type(Type##Node* that); | |
4610 FOR_EACH_NODE_TYPE(DECLARE_VISIT) | |
4611 #undef DECLARE_VISIT | |
4612 private: | |
4613 bool ignore_case_; | |
4614 }; | |
4615 | |
4616 | |
4617 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { | |
4618 OS::Print("digraph G {\n graph [label=\""); | |
4619 for (intptr_t i = 0; label[i]; i++) { | |
4620 switch (label[i]) { | |
4621 case '\\': | |
4622 OS::Print("\\\\"); | |
4623 break; | |
4624 case '"': | |
4625 OS::Print("\""); | |
4626 break; | |
4627 default: | |
4628 OS::Print("%c", label[i]); | |
4629 break; | |
4630 } | |
4631 } | |
4632 OS::Print("\"];\n"); | |
4633 Visit(node); | |
4634 OS::Print("}\n"); | |
4635 } | |
4636 | |
4637 | |
4638 void DotPrinter::Visit(RegExpNode* node) { | |
4639 if (node->info()->visited) return; | |
4640 node->info()->visited = true; | |
4641 node->Accept(this); | |
4642 } | |
4643 | |
4644 | |
4645 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { | |
4646 OS::Print(" n%p -> n%p [style=dotted];\n", from, on_failure); | |
4647 Visit(on_failure); | |
4648 } | |
4649 | |
4650 | |
4651 class AttributePrinter : public ValueObject { | |
4652 public: | |
4653 AttributePrinter() : first_(true) {} | |
4654 void PrintSeparator() { | |
4655 if (first_) { | |
4656 first_ = false; | |
4657 } else { | |
4658 OS::Print("|"); | |
4659 } | |
4660 } | |
4661 void PrintBit(const char* name, bool value) { | |
4662 if (!value) return; | |
4663 PrintSeparator(); | |
4664 OS::Print("{%s}", name); | |
4665 } | |
4666 void PrintPositive(const char* name, intptr_t value) { | |
4667 if (value < 0) return; | |
4668 PrintSeparator(); | |
4669 OS::Print("{%s|%" Pd "}", name, value); | |
4670 } | |
4671 | |
4672 private: | |
4673 bool first_; | |
4674 }; | |
4675 | |
4676 | |
4677 void DotPrinter::PrintAttributes(RegExpNode* that) { | |
4678 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " | |
4679 "margin=0.1, fontsize=10, label=\"{", that); | |
4680 AttributePrinter printer; | |
4681 NodeInfo* info = that->info(); | |
4682 printer.PrintBit("NI", info->follows_newline_interest); | |
4683 printer.PrintBit("WI", info->follows_word_interest); | |
4684 printer.PrintBit("SI", info->follows_start_interest); | |
4685 BlockLabel* label = that->label(); | |
4686 if (label->IsBound()) | |
4687 printer.PrintPositive("@", label->Position()); | |
4688 OS::Print("}\"];\n" | |
4689 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n", | |
4690 that, that); | |
4691 } | |
4692 | |
4693 | |
4694 static const bool kPrintDispatchTable = false; | |
4695 void DotPrinter::VisitChoice(ChoiceNode* that) { | |
4696 if (kPrintDispatchTable) { | |
4697 OS::Print(" n%p [shape=Mrecord, label=\"", that); | |
4698 UNIMPLEMENTED(); | |
4699 } else { | |
4700 OS::Print(" n%p [shape=Mrecord, label=\"?\"];\n", that); | |
4701 for (intptr_t i = 0; i < that->alternatives()->length(); i++) { | |
4702 GuardedAlternative alt = that->alternatives()->At(i); | |
4703 OS::Print(" n%p -> n%p", that, alt.node()); | |
4704 } | |
4705 } | |
4706 for (intptr_t i = 0; i < that->alternatives()->length(); i++) { | |
4707 GuardedAlternative alt = that->alternatives()->At(i); | |
4708 alt.node()->Accept(this); | |
4709 } | |
4710 } | |
4711 | |
4712 | |
4713 void DotPrinter::VisitText(TextNode* that) { | |
4714 OS::Print(" n%p [label=\"", that); | |
4715 for (intptr_t i = 0; i < that->elements()->length(); i++) { | |
4716 if (i > 0) OS::Print(" "); | |
4717 TextElement elm = that->elements()->At(i); | |
4718 switch (elm.text_type()) { | |
4719 case TextElement::ATOM: { | |
4720 ZoneGrowableArray<uint16_t>* data = elm.atom()->data(); | |
4721 for (intptr_t i = 0; i < data->length(); i++) { | |
4722 OS::Print("%c", static_cast<char>(data->At(i))); | |
4723 } | |
4724 break; | |
4725 } | |
4726 case TextElement::CHAR_CLASS: { | |
4727 RegExpCharacterClass* node = elm.char_class(); | |
4728 OS::Print("["); | |
4729 if (node->is_negated()) OS::Print("^"); | |
4730 for (intptr_t j = 0; j < node->ranges()->length(); j++) { | |
4731 CharacterRange range = node->ranges()->At(j); | |
4732 PrintUtf16(range.from()); | |
4733 OS::Print("-"); | |
4734 PrintUtf16(range.to()); | |
4735 } | |
4736 OS::Print("]"); | |
4737 break; | |
4738 } | |
4739 default: | |
4740 UNREACHABLE(); | |
4741 } | |
4742 } | |
4743 OS::Print("\", shape=box, peripheries=2];\n"); | |
4744 PrintAttributes(that); | |
4745 OS::Print(" n%p -> n%p;\n", that, that->on_success()); | |
4746 Visit(that->on_success()); | |
4747 } | |
4748 | |
4749 | |
4750 void DotPrinter::VisitBackReference(BackReferenceNode* that) { | |
4751 OS::Print(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n", | |
4752 that, that->start_register(), that->end_register()); | |
4753 PrintAttributes(that); | |
4754 OS::Print(" n%p -> n%p;\n", that, that->on_success()); | |
4755 Visit(that->on_success()); | |
4756 } | |
4757 | |
4758 | |
4759 void DotPrinter::VisitEnd(EndNode* that) { | |
4760 OS::Print(" n%p [style=bold, shape=point];\n", that); | |
4761 PrintAttributes(that); | |
4762 } | |
4763 | |
4764 | |
4765 void DotPrinter::VisitAssertion(AssertionNode* that) { | |
4766 OS::Print(" n%p [", that); | |
4767 switch (that->assertion_type()) { | |
4768 case AssertionNode::AT_END: | |
4769 OS::Print("label=\"$\", shape=septagon"); | |
4770 break; | |
4771 case AssertionNode::AT_START: | |
4772 OS::Print("label=\"^\", shape=septagon"); | |
4773 break; | |
4774 case AssertionNode::AT_BOUNDARY: | |
4775 OS::Print("label=\"\\b\", shape=septagon"); | |
4776 break; | |
4777 case AssertionNode::AT_NON_BOUNDARY: | |
4778 OS::Print("label=\"\\B\", shape=septagon"); | |
4779 break; | |
4780 case AssertionNode::AFTER_NEWLINE: | |
4781 OS::Print("label=\"(?<=\\n)\", shape=septagon"); | |
4782 break; | |
4783 } | |
4784 OS::Print("];\n"); | |
4785 PrintAttributes(that); | |
4786 RegExpNode* successor = that->on_success(); | |
4787 OS::Print(" n%p -> n%p;\n", that, successor); | |
4788 Visit(successor); | |
4789 } | |
4790 | |
4791 | |
4792 void DotPrinter::VisitAction(ActionNode* that) { | |
4793 OS::Print(" n%p [", that); | |
4794 switch (that->action_type_) { | |
4795 case ActionNode::SET_REGISTER: | |
4796 OS::Print("label=\"$%" Pd ":=%" Pd "\", shape=octagon", | |
4797 that->data_.u_store_register.reg, | |
4798 that->data_.u_store_register.value); | |
4799 break; | |
4800 case ActionNode::INCREMENT_REGISTER: | |
4801 OS::Print("label=\"$%" Pd "++\", shape=octagon", | |
4802 that->data_.u_increment_register.reg); | |
4803 break; | |
4804 case ActionNode::STORE_POSITION: | |
4805 OS::Print("label=\"$%" Pd ":=$pos\", shape=octagon", | |
4806 that->data_.u_position_register.reg); | |
4807 break; | |
4808 case ActionNode::BEGIN_SUBMATCH: | |
4809 OS::Print("label=\"$%" Pd ":=$pos,begin\", shape=septagon", | |
4810 that->data_.u_submatch.current_position_register); | |
4811 break; | |
4812 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: | |
4813 OS::Print("label=\"escape\", shape=septagon"); | |
4814 break; | |
4815 case ActionNode::EMPTY_MATCH_CHECK: | |
4816 OS::Print("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon", | |
4817 that->data_.u_empty_match_check.start_register, | |
4818 that->data_.u_empty_match_check.repetition_register, | |
4819 that->data_.u_empty_match_check.repetition_limit); | |
4820 break; | |
4821 case ActionNode::CLEAR_CAPTURES: { | |
4822 OS::Print("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon", | |
4823 that->data_.u_clear_captures.range_from, | |
4824 that->data_.u_clear_captures.range_to); | |
4825 break; | |
4826 } | |
4827 } | |
4828 OS::Print("];\n"); | |
4829 PrintAttributes(that); | |
4830 RegExpNode* successor = that->on_success(); | |
4831 OS::Print(" n%p -> n%p;\n", that, successor); | |
4832 Visit(successor); | |
4833 } | |
4834 | |
4835 | |
4836 void RegExpEngine::DotPrint(const char* label, | |
4837 RegExpNode* node, | |
4838 bool ignore_case) { | |
4839 DotPrinter printer(ignore_case); | |
4840 printer.PrintNode(label, node); | |
4841 } | |
4842 | |
4843 | |
4844 #endif // DEBUG | |
4845 | |
4846 } // namespace dart | |
OLD | NEW |