Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(207)

Side by Side Diff: runtime/vm/regexp.cc

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Addressed Ivan's comments. Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « runtime/vm/regexp.h ('k') | runtime/vm/regexp_assembler.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/regexp.h"
6
7 #include "vm/dart_entry.h"
8 #include "vm/regexp_assembler.h"
9 #include "vm/regexp_ast.h"
10 #include "vm/unibrow-inl.h"
11 #include "vm/unicode.h"
12 #include "vm/symbols.h"
13
14 #define I isolate()
15 #define CI compiler->isolate()
16
17 namespace dart {
18
19 DECLARE_FLAG(bool, trace_irregexp);
20
21 #define DEFINE_ACCEPT(Type) \
22 void Type##Node::Accept(NodeVisitor* visitor) { \
23 visitor->Visit##Type(this); \
24 }
25 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
26 #undef DEFINE_ACCEPT
27
28
29 // Default to generating optimized regexp code.
30 static const bool kRegexpOptimization = true;
31
32 // More makes code generation slower, less makes V8 benchmark score lower.
33 static const intptr_t kMaxLookaheadForBoyerMoore = 8;
34
35 // In a 3-character pattern you can maximally step forwards 3 characters
36 // at a time, which is not always enough to pay for the extra logic.
37 static const intptr_t kPatternTooShortForBoyerMoore = 2;
38
39 // The '2' variant has inclusive from and exclusive to.
40 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
41 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
42 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
43 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
44 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
45 0xFEFF, 0xFF00, 0x10000 };
46 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
47 static const intptr_t kWordRanges[] = {
48 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
49 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
50 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 };
51 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
52 static const intptr_t kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
53 static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
54 static const intptr_t kLineTerminatorRanges[] = {
55 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 };
56 static const intptr_t kLineTerminatorRangeCount =
57 ARRAY_SIZE(kLineTerminatorRanges);
58
59
60 static inline void PrintUtf16(uint16_t c) {
61 const char* format = (0x20 <= c && c <= 0x7F) ?
62 "%c" : (c <= 0xff) ? "\\x%02x" : "\\u%04x";
63 OS::Print(format, c);
64 }
65
66
67 // We need to check for the following characters: 0x39c 0x3bc 0x178.
68 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
69 // TODO(dcarney): this could be a lot more efficient.
70 return range.Contains(0x39c) ||
71 range.Contains(0x3bc) || range.Contains(0x178);
72 }
73
74
75 static bool RangesContainLatin1Equivalents(
76 ZoneGrowableArray<CharacterRange>* ranges) {
77 for (intptr_t i = 0; i < ranges->length(); i++) {
78 // TODO(dcarney): this could be a lot more efficient.
79 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;
80 }
81 return false;
82 }
83
84 static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
85 ASSERT(c > Symbols::kMaxOneCharCodeSymbol);
86 switch (c) {
87 // This are equivalent characters in unicode.
88 case 0x39c:
89 case 0x3bc:
90 return 0xb5;
91 // This is an uppercase of a Latin-1 character
92 // outside of Latin-1.
93 case 0x178:
94 return 0xff;
95 }
96 return 0;
97 }
98
99
100 class VisitMarker : public ValueObject {
101 public:
102 explicit VisitMarker(NodeInfo* info) : info_(info) {
103 ASSERT(!info->visited);
104 info->visited = true;
105 }
106 ~VisitMarker() {
107 info_->visited = false;
108 }
109 private:
110 NodeInfo* info_;
111 };
112
113
114 class FrequencyCollator : public ValueObject {
115 public:
116 FrequencyCollator() : total_samples_(0) {
117 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
118 frequencies_[i] = CharacterFrequency(i);
119 }
120 }
121
122 void CountCharacter(intptr_t character) {
123 intptr_t index = (character & RegExpMacroAssembler::kTableMask);
124 frequencies_[index].Increment();
125 total_samples_++;
126 }
127
128 // Does not measure in percent, but rather per-128 (the table size from the
129 // regexp macro assembler).
130 intptr_t Frequency(intptr_t in_character) {
131 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
132 if (total_samples_ < 1) return 1; // Division by zero.
133 intptr_t freq_in_per128 =
134 (frequencies_[in_character].counter() * 128) / total_samples_;
135 return freq_in_per128;
136 }
137
138 private:
139 class CharacterFrequency {
140 public:
141 CharacterFrequency() : counter_(0), character_(-1) { }
142 explicit CharacterFrequency(intptr_t character)
143 : counter_(0), character_(character) { }
144
145 void Increment() { counter_++; }
146 intptr_t counter() { return counter_; }
147 intptr_t character() { return character_; }
148
149 private:
150 intptr_t counter_;
151 intptr_t character_;
152
153 DISALLOW_ALLOCATION();
154 };
155
156
157 private:
158 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
159 intptr_t total_samples_;
160 };
161
162
163 class RegExpCompiler : public ValueObject {
164 public:
165 RegExpCompiler(intptr_t capture_count,
166 bool ignore_case,
167 intptr_t specialization_cid);
168
169 intptr_t AllocateRegister() {
170 return next_register_++;
171 }
172
173 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,
174 RegExpNode* start,
175 intptr_t capture_count,
176 const String& pattern);
177
178 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
179
180 static const intptr_t kImplementationOffset = 0;
181 static const intptr_t kNumberOfRegistersOffset = 0;
182 static const intptr_t kCodeOffset = 1;
183
184 IRRegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
185 EndNode* accept() { return accept_; }
186
187 static const intptr_t kMaxRecursion = 100;
188 inline intptr_t recursion_depth() { return recursion_depth_; }
189 inline void IncrementRecursionDepth() { recursion_depth_++; }
190 inline void DecrementRecursionDepth() { recursion_depth_--; }
191
192 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
193
194 inline bool ignore_case() { return ignore_case_; }
195 inline bool ascii() const {
196 return (specialization_cid_ == kOneByteStringCid ||
197 specialization_cid_ == kExternalOneByteStringCid);
198 }
199 inline intptr_t specialization_cid() { return specialization_cid_; }
200 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
201
202 intptr_t current_expansion_factor() { return current_expansion_factor_; }
203 void set_current_expansion_factor(intptr_t value) {
204 current_expansion_factor_ = value;
205 }
206
207 Isolate* isolate() const { return isolate_; }
208
209 static const intptr_t kNoRegister = -1;
210
211 private:
212 EndNode* accept_;
213 intptr_t next_register_;
214 ZoneGrowableArray<RegExpNode*>* work_list_;
215 intptr_t recursion_depth_;
216 IRRegExpMacroAssembler* macro_assembler_;
217 bool ignore_case_;
218 intptr_t specialization_cid_;
219 bool reg_exp_too_big_;
220 intptr_t current_expansion_factor_;
221 FrequencyCollator frequency_collator_;
222 Isolate* isolate_;
223 };
224
225
226 class RecursionCheck : public ValueObject {
227 public:
228 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
229 compiler->IncrementRecursionDepth();
230 }
231 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
232 private:
233 RegExpCompiler* compiler_;
234 };
235
236
237 // Scoped object to keep track of how much we unroll quantifier loops in the
238 // regexp graph generator.
239 class RegExpExpansionLimiter : public ValueObject {
240 public:
241 static const intptr_t kMaxExpansionFactor = 6;
242 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)
243 : compiler_(compiler),
244 saved_expansion_factor_(compiler->current_expansion_factor()),
245 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
246 ASSERT(factor > 0);
247 if (ok_to_expand_) {
248 if (factor > kMaxExpansionFactor) {
249 // Avoid integer overflow of the current expansion factor.
250 ok_to_expand_ = false;
251 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
252 } else {
253 intptr_t new_factor = saved_expansion_factor_ * factor;
254 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
255 compiler->set_current_expansion_factor(new_factor);
256 }
257 }
258 }
259
260 ~RegExpExpansionLimiter() {
261 compiler_->set_current_expansion_factor(saved_expansion_factor_);
262 }
263
264 bool ok_to_expand() { return ok_to_expand_; }
265
266 private:
267 RegExpCompiler* compiler_;
268 intptr_t saved_expansion_factor_;
269 bool ok_to_expand_;
270
271 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
272 };
273
274
275 // Node generation -------------------------------------------------------------
276
277 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
278 RegExpNode* on_success) {
279 ZoneGrowableArray<TextElement>* elms =
280 new(CI) ZoneGrowableArray<TextElement>(1);
281 elms->Add(TextElement::Atom(this));
282 return new(CI) TextNode(elms, on_success);
283 }
284
285
286 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
287 RegExpNode* on_success) {
288 ZoneGrowableArray<TextElement>* elms =
289 new(CI) ZoneGrowableArray<TextElement>(1);
290 for (intptr_t i = 0; i < elements()->length(); i++) {
291 elms->Add(elements()->At(i));
292 }
293 return new(CI) TextNode(elms, on_success);
294 }
295
296
297 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
298 RegExpNode* on_success) {
299 return new(CI) TextNode(this, on_success);
300 }
301
302
303 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
304 RegExpNode* on_success) {
305 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
306 intptr_t length = alternatives->length();
307 ChoiceNode* result =
308 new(CI) ChoiceNode(length, CI);
309 for (intptr_t i = 0; i < length; i++) {
310 GuardedAlternative alternative(alternatives->At(i)->ToNode(compiler,
311 on_success));
312 result->AddAlternative(alternative);
313 }
314 return result;
315 }
316
317
318 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
319 RegExpNode* on_success) {
320 return ToNode(min(),
321 max(),
322 is_greedy(),
323 body(),
324 compiler,
325 on_success);
326 }
327
328
329 RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
330 intptr_t max,
331 bool is_greedy,
332 RegExpTree* body,
333 RegExpCompiler* compiler,
334 RegExpNode* on_success,
335 bool not_at_start) {
336 // x{f, t} becomes this:
337 //
338 // (r++)<-.
339 // | `
340 // | (x)
341 // v ^
342 // (r=0)-->(?)---/ [if r < t]
343 // |
344 // [if r >= f] \----> ...
345 //
346
347 // 15.10.2.5 RepeatMatcher algorithm.
348 // The parser has already eliminated the case where max is 0. In the case
349 // where max_match is zero the parser has removed the quantifier if min was
350 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
351
352 // If we know that we cannot match zero length then things are a little
353 // simpler since we don't need to make the special zero length match check
354 // from step 2.1. If the min and max are small we can unroll a little in
355 // this case.
356 // Unroll (foo)+ and (foo){3,}
357 static const intptr_t kMaxUnrolledMinMatches = 3;
358 // Unroll (foo)? and (foo){x,3}
359 static const intptr_t kMaxUnrolledMaxMatches = 3;
360 if (max == 0) return on_success; // This can happen due to recursion.
361 bool body_can_be_empty = (body->min_match() == 0);
362 intptr_t body_start_reg = RegExpCompiler::kNoRegister;
363 Interval capture_registers = body->CaptureRegisters();
364 bool needs_capture_clearing = !capture_registers.is_empty();
365 Isolate* isolate = compiler->isolate();
366
367 if (body_can_be_empty) {
368 body_start_reg = compiler->AllocateRegister();
369 } else if (kRegexpOptimization && !needs_capture_clearing) {
370 // Only unroll if there are no captures and the body can't be
371 // empty.
372 {
373 RegExpExpansionLimiter limiter(
374 compiler, min + ((max != min) ? 1 : 0));
375 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
376 intptr_t new_max = (max == kInfinity) ? max : max - min;
377 // Recurse once to get the loop or optional matches after the fixed
378 // ones.
379 RegExpNode* answer = ToNode(
380 0, new_max, is_greedy, body, compiler, on_success, true);
381 // Unroll the forced matches from 0 to min. This can cause chains of
382 // TextNodes (which the parser does not generate). These should be
383 // combined if it turns out they hinder good code generation.
384 for (intptr_t i = 0; i < min; i++) {
385 answer = body->ToNode(compiler, answer);
386 }
387 return answer;
388 }
389 }
390 if (max <= kMaxUnrolledMaxMatches && min == 0) {
391 ASSERT(max > 0); // Due to the 'if' above.
392 RegExpExpansionLimiter limiter(compiler, max);
393 if (limiter.ok_to_expand()) {
394 // Unroll the optional matches up to max.
395 RegExpNode* answer = on_success;
396 for (intptr_t i = 0; i < max; i++) {
397 ChoiceNode* alternation = new(isolate) ChoiceNode(2, isolate);
398 if (is_greedy) {
399 alternation->AddAlternative(
400 GuardedAlternative(body->ToNode(compiler, answer)));
401 alternation->AddAlternative(GuardedAlternative(on_success));
402 } else {
403 alternation->AddAlternative(GuardedAlternative(on_success));
404 alternation->AddAlternative(
405 GuardedAlternative(body->ToNode(compiler, answer)));
406 }
407 answer = alternation;
408 if (not_at_start) alternation->set_not_at_start();
409 }
410 return answer;
411 }
412 }
413 }
414 bool has_min = min > 0;
415 bool has_max = max < RegExpTree::kInfinity;
416 bool needs_counter = has_min || has_max;
417 intptr_t reg_ctr = needs_counter
418 ? compiler->AllocateRegister()
419 : RegExpCompiler::kNoRegister;
420 LoopChoiceNode* center = new(isolate) LoopChoiceNode(body->min_match() == 0,
421 isolate);
422 if (not_at_start) center->set_not_at_start();
423 RegExpNode* loop_return = needs_counter
424 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
425 : static_cast<RegExpNode*>(center);
426 if (body_can_be_empty) {
427 // If the body can be empty we need to check if it was and then
428 // backtrack.
429 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
430 reg_ctr,
431 min,
432 loop_return);
433 }
434 RegExpNode* body_node = body->ToNode(compiler, loop_return);
435 if (body_can_be_empty) {
436 // If the body can be empty we need to store the start position
437 // so we can bail out if it was empty.
438 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
439 }
440 if (needs_capture_clearing) {
441 // Before entering the body of this loop we need to clear captures.
442 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
443 }
444 GuardedAlternative body_alt(body_node);
445 if (has_max) {
446 Guard* body_guard =
447 new(isolate) Guard(reg_ctr, Guard::LT, max);
448 body_alt.AddGuard(body_guard, isolate);
449 }
450 GuardedAlternative rest_alt(on_success);
451 if (has_min) {
452 Guard* rest_guard = new(isolate) Guard(reg_ctr, Guard::GEQ, min);
453 rest_alt.AddGuard(rest_guard, isolate);
454 }
455 if (is_greedy) {
456 center->AddLoopAlternative(body_alt);
457 center->AddContinueAlternative(rest_alt);
458 } else {
459 center->AddContinueAlternative(rest_alt);
460 center->AddLoopAlternative(body_alt);
461 }
462 if (needs_counter) {
463 return ActionNode::SetRegister(reg_ctr, 0, center);
464 } else {
465 return center;
466 }
467 }
468
469
470 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
471 RegExpNode* on_success) {
472 switch (assertion_type()) {
473 case START_OF_LINE:
474 return AssertionNode::AfterNewline(on_success);
475 case START_OF_INPUT:
476 return AssertionNode::AtStart(on_success);
477 case BOUNDARY:
478 return AssertionNode::AtBoundary(on_success);
479 case NON_BOUNDARY:
480 return AssertionNode::AtNonBoundary(on_success);
481 case END_OF_INPUT:
482 return AssertionNode::AtEnd(on_success);
483 case END_OF_LINE: {
484 // Compile $ in multiline regexps as an alternation with a positive
485 // lookahead in one side and an end-of-input on the other side.
486 // We need two registers for the lookahead.
487 intptr_t stack_pointer_register = compiler->AllocateRegister();
488 intptr_t position_register = compiler->AllocateRegister();
489 // The ChoiceNode to distinguish between a newline and end-of-input.
490 ChoiceNode* result = new ChoiceNode(2, on_success->isolate());
491 // Create a newline atom.
492 ZoneGrowableArray<CharacterRange>* newline_ranges =
493 new ZoneGrowableArray<CharacterRange>(3);
494 CharacterRange::AddClassEscape('n', newline_ranges);
495 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
496 TextNode* newline_matcher = new TextNode(
497 newline_atom,
498 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
499 position_register,
500 0, // No captures inside.
501 -1, // Ignored if no captures.
502 on_success));
503 // Create an end-of-input matcher.
504 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
505 stack_pointer_register,
506 position_register,
507 newline_matcher);
508 // Add the two alternatives to the ChoiceNode.
509 GuardedAlternative eol_alternative(end_of_line);
510 result->AddAlternative(eol_alternative);
511 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
512 result->AddAlternative(end_alternative);
513 return result;
514 }
515 default:
516 UNREACHABLE();
517 }
518 return on_success;
519 }
520
521
522 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
523 RegExpNode* on_success) {
524 return new(CI)
525 BackReferenceNode(RegExpCapture::StartRegister(index()),
526 RegExpCapture::EndRegister(index()),
527 on_success);
528 }
529
530
531 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
532 RegExpNode* on_success) {
533 return on_success;
534 }
535
536
537 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
538 RegExpNode* on_success) {
539 intptr_t stack_pointer_register = compiler->AllocateRegister();
540 intptr_t position_register = compiler->AllocateRegister();
541
542 const intptr_t registers_per_capture = 2;
543 const intptr_t register_of_first_capture = 2;
544 intptr_t register_count = capture_count_ * registers_per_capture;
545 intptr_t register_start =
546 register_of_first_capture + capture_from_ * registers_per_capture;
547
548 RegExpNode* success;
549 if (is_positive()) {
550 RegExpNode* node = ActionNode::BeginSubmatch(
551 stack_pointer_register,
552 position_register,
553 body()->ToNode(
554 compiler,
555 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
556 position_register,
557 register_count,
558 register_start,
559 on_success)));
560 return node;
561 } else {
562 // We use a ChoiceNode for a negative lookahead because it has most of
563 // the characteristics we need. It has the body of the lookahead as its
564 // first alternative and the expression after the lookahead of the second
565 // alternative. If the first alternative succeeds then the
566 // NegativeSubmatchSuccess will unwind the stack including everything the
567 // choice node set up and backtrack. If the first alternative fails then
568 // the second alternative is tried, which is exactly the desired result
569 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
570 // ChoiceNode that knows to ignore the first exit when calculating quick
571 // checks.
572
573 GuardedAlternative body_alt(
574 body()->ToNode(
575 compiler,
576 success = new(CI) NegativeSubmatchSuccess(stack_pointer_register,
577 position_register,
578 register_count,
579 register_start,
580 CI)));
581 ChoiceNode* choice_node =
582 new(CI) NegativeLookaheadChoiceNode(body_alt,
583 GuardedAlternative(on_success),
584 CI);
585 return ActionNode::BeginSubmatch(stack_pointer_register,
586 position_register,
587 choice_node);
588 }
589 }
590
591
592 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
593 RegExpNode* on_success) {
594 return ToNode(body(), index(), compiler, on_success);
595 }
596
597
598 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
599 intptr_t index,
600 RegExpCompiler* compiler,
601 RegExpNode* on_success) {
602 intptr_t start_reg = RegExpCapture::StartRegister(index);
603 intptr_t end_reg = RegExpCapture::EndRegister(index);
604 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
605 RegExpNode* body_node = body->ToNode(compiler, store_end);
606 return ActionNode::StorePosition(start_reg, true, body_node);
607 }
608
609
610 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
611 RegExpNode* on_success) {
612 ZoneGrowableArray<RegExpTree*>* children = nodes();
613 RegExpNode* current = on_success;
614 for (intptr_t i = children->length() - 1; i >= 0; i--) {
615 current = children->At(i)->ToNode(compiler, current);
616 }
617 return current;
618 }
619
620
621 // ASCII filtering -------------------------------------------------------------
622
623
624 RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth, bool ignore_case) {
625 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
626 if (next == NULL) return set_replacement(NULL);
627 on_success_ = next;
628 return set_replacement(this);
629 }
630
631
632 RegExpNode* SeqRegExpNode::FilterASCII(intptr_t depth, bool ignore_case) {
633 if (info()->replacement_calculated) return replacement();
634 if (depth < 0) return this;
635 ASSERT(!info()->visited);
636 VisitMarker marker(info());
637 return FilterSuccessor(depth - 1, ignore_case);
638 }
639
640
641 RegExpNode* TextNode::FilterASCII(intptr_t depth, bool ignore_case) {
642 if (info()->replacement_calculated) return replacement();
643 if (depth < 0) return this;
644 ASSERT(!info()->visited);
645 VisitMarker marker(info());
646 intptr_t element_count = elms_->length();
647 for (intptr_t i = 0; i < element_count; i++) {
648 TextElement elm = elms_->At(i);
649 if (elm.text_type() == TextElement::ATOM) {
650 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
651 for (intptr_t j = 0; j < quarks->length(); j++) {
652 uint16_t c = quarks->At(j);
653 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
654 if (!ignore_case) return set_replacement(NULL);
655 // Here, we need to check for characters whose upper and lower cases
656 // are outside the Latin-1 range.
657 uint16_t converted = ConvertNonLatin1ToLatin1(c);
658 // Character is outside Latin-1 completely
659 if (converted == 0) return set_replacement(NULL);
660 // Convert quark to Latin-1 in place.
661 (*quarks)[0] = converted;
662 }
663 } else {
664 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
665 RegExpCharacterClass* cc = elm.char_class();
666 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
667 if (!CharacterRange::IsCanonical(ranges)) {
668 CharacterRange::Canonicalize(ranges);
669 }
670 // Now they are in order so we only need to look at the first.
671 intptr_t range_count = ranges->length();
672 if (cc->is_negated()) {
673 if (range_count != 0 &&
674 ranges->At(0).from() == 0 &&
675 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) {
676 // This will be handled in a later filter.
677 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
678 return set_replacement(NULL);
679 }
680 } else {
681 if (range_count == 0 ||
682 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) {
683 // This will be handled in a later filter.
684 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
685 return set_replacement(NULL);
686 }
687 }
688 }
689 }
690 return FilterSuccessor(depth - 1, ignore_case);
691 }
692
693
694 RegExpNode* LoopChoiceNode::FilterASCII(intptr_t depth, bool ignore_case) {
695 if (info()->replacement_calculated) return replacement();
696 if (depth < 0) return this;
697 if (info()->visited) return this;
698 {
699 VisitMarker marker(info());
700
701 RegExpNode* continue_replacement =
702 continue_node_->FilterASCII(depth - 1, ignore_case);
703 // If we can't continue after the loop then there is no sense in doing the
704 // loop.
705 if (continue_replacement == NULL) return set_replacement(NULL);
706 }
707
708 return ChoiceNode::FilterASCII(depth - 1, ignore_case);
709 }
710
711
712 RegExpNode* ChoiceNode::FilterASCII(intptr_t depth, bool ignore_case) {
713 if (info()->replacement_calculated) return replacement();
714 if (depth < 0) return this;
715 if (info()->visited) return this;
716 VisitMarker marker(info());
717 intptr_t choice_count = alternatives_->length();
718
719 for (intptr_t i = 0; i < choice_count; i++) {
720 GuardedAlternative alternative = alternatives_->At(i);
721 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
722 set_replacement(this);
723 return this;
724 }
725 }
726
727 intptr_t surviving = 0;
728 RegExpNode* survivor = NULL;
729 for (intptr_t i = 0; i < choice_count; i++) {
730 GuardedAlternative alternative = alternatives_->At(i);
731 RegExpNode* replacement =
732 alternative.node()->FilterASCII(depth - 1, ignore_case);
733 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
734 if (replacement != NULL) {
735 (*alternatives_)[i].set_node(replacement);
736 surviving++;
737 survivor = replacement;
738 }
739 }
740 if (surviving < 2) return set_replacement(survivor);
741
742 set_replacement(this);
743 if (surviving == choice_count) {
744 return this;
745 }
746 // Only some of the nodes survived the filtering. We need to rebuild the
747 // alternatives list.
748 ZoneGrowableArray<GuardedAlternative>* new_alternatives =
749 new(I) ZoneGrowableArray<GuardedAlternative>(surviving);
750 for (intptr_t i = 0; i < choice_count; i++) {
751 RegExpNode* replacement =
752 (*alternatives_)[i].node()->FilterASCII(depth - 1, ignore_case);
753 if (replacement != NULL) {
754 (*alternatives_)[i].set_node(replacement);
755 new_alternatives->Add((*alternatives_)[i]);
756 }
757 }
758 alternatives_ = new_alternatives;
759 return this;
760 }
761
762
763 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(intptr_t depth,
764 bool ignore_case) {
765 if (info()->replacement_calculated) return replacement();
766 if (depth < 0) return this;
767 if (info()->visited) return this;
768 VisitMarker marker(info());
769 // Alternative 0 is the negative lookahead, alternative 1 is what comes
770 // afterwards.
771 RegExpNode* node = (*alternatives_)[1].node();
772 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
773 if (replacement == NULL) return set_replacement(NULL);
774 (*alternatives_)[1].set_node(replacement);
775
776 RegExpNode* neg_node = (*alternatives_)[0].node();
777 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
778 // If the negative lookahead is always going to fail then
779 // we don't need to check it.
780 if (neg_replacement == NULL) return set_replacement(replacement);
781 (*alternatives_)[0].set_node(neg_replacement);
782 return set_replacement(this);
783 }
784
785
786 // Code emission ---------------------------------------------------------------
787
788
789 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
790 Guard* guard,
791 Trace* trace) {
792 switch (guard->op()) {
793 case Guard::LT:
794 ASSERT(!trace->mentions_reg(guard->reg()));
795 macro_assembler->IfRegisterGE(guard->reg(),
796 guard->value(),
797 trace->backtrack());
798 break;
799 case Guard::GEQ:
800 ASSERT(!trace->mentions_reg(guard->reg()));
801 macro_assembler->IfRegisterLT(guard->reg(),
802 guard->value(),
803 trace->backtrack());
804 break;
805 }
806 }
807
808
809 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
810 RegExpMacroAssembler* assembler = compiler->macro_assembler();
811
812 // Omit flushing the trace. We discard the entire stack frame anyway.
813
814 if (!label()->IsBound()) {
815 // We are completely independent of the trace, since we ignore it,
816 // so this code can be used as the generic version.
817 assembler->BindBlock(label());
818 }
819
820 // Throw away everything on the backtrack stack since the start
821 // of the negative submatch and restore the character position.
822 assembler->ReadCurrentPositionFromRegister(current_position_register_);
823 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
824 if (clear_capture_count_ > 0) {
825 // Clear any captures that might have been performed during the success
826 // of the body of the negative look-ahead.
827 intptr_t clear_capture_end =
828 clear_capture_start_ + clear_capture_count_ - 1;
829 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
830 }
831 // Now that we have unwound the stack we find at the top of the stack the
832 // backtrack that the BeginSubmatch node got.
833 assembler->Backtrack();
834 }
835
836
837 bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
838 ASSERT(*cp_offset == 0);
839 for (DeferredAction* action = actions_;
840 action != NULL;
841 action = action->next()) {
842 if (action->Mentions(reg)) {
843 if (action->action_type() == ActionNode::STORE_POSITION) {
844 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
845 return true;
846 } else {
847 return false;
848 }
849 }
850 }
851 return false;
852 }
853
854
855 // This is called as we come into a loop choice node and some other tricky
856 // nodes. It normalizes the state of the code generator to ensure we can
857 // generate generic code.
858 intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers,
859 Isolate* isolate) {
860 intptr_t max_register = RegExpCompiler::kNoRegister;
861 for (DeferredAction* action = actions_;
862 action != NULL;
863 action = action->next()) {
864 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
865 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
866 for (intptr_t i = range.from(); i <= range.to(); i++)
867 affected_registers->Set(i, isolate);
868 if (range.to() > max_register) max_register = range.to();
869 } else {
870 affected_registers->Set(action->reg(), isolate);
871 if (action->reg() > max_register) max_register = action->reg();
872 }
873 }
874 return max_register;
875 }
876
877
878 bool Trace::DeferredAction::Mentions(intptr_t that) {
879 if (action_type() == ActionNode::CLEAR_CAPTURES) {
880 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
881 return range.Contains(that);
882 } else {
883 return reg() == that;
884 }
885 }
886
887
888 bool Trace::mentions_reg(intptr_t reg) {
889 for (DeferredAction* action = actions_;
890 action != NULL;
891 action = action->next()) {
892 if (action->Mentions(reg))
893 return true;
894 }
895 return false;
896 }
897
898
899 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
900 intptr_t max_register,
901 const OutSet& registers_to_pop,
902 const OutSet& registers_to_clear) {
903 for (intptr_t reg = max_register; reg >= 0; reg--) {
904 if (registers_to_pop.Get(reg)) {
905 assembler->PopRegister(reg);
906 } else if (registers_to_clear.Get(reg)) {
907 intptr_t clear_to = reg;
908 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
909 reg--;
910 }
911 assembler->ClearRegisters(reg, clear_to);
912 }
913 }
914 }
915
916
917 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
918 intptr_t max_register,
919 const OutSet& affected_registers,
920 OutSet* registers_to_pop,
921 OutSet* registers_to_clear,
922 Isolate* isolate) {
923 for (intptr_t reg = 0; reg <= max_register; reg++) {
924 if (!affected_registers.Get(reg)) {
925 continue;
926 }
927
928 // The chronologically first deferred action in the trace
929 // is used to infer the action needed to restore a register
930 // to its previous state (or not, if it's safe to ignore it).
931 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
932 DeferredActionUndoType undo_action = IGNORE;
933
934 intptr_t value = 0;
935 bool absolute = false;
936 bool clear = false;
937 intptr_t store_position = -1;
938 // This is a little tricky because we are scanning the actions in reverse
939 // historical order (newest first).
940 for (DeferredAction* action = actions_;
941 action != NULL;
942 action = action->next()) {
943 if (action->Mentions(reg)) {
944 switch (action->action_type()) {
945 case ActionNode::SET_REGISTER: {
946 Trace::DeferredSetRegister* psr =
947 static_cast<Trace::DeferredSetRegister*>(action);
948 if (!absolute) {
949 value += psr->value();
950 absolute = true;
951 }
952 // SET_REGISTER is currently only used for newly introduced loop
953 // counters. They can have a significant previous value if they
954 // occour in a loop. TODO(lrn): Propagate this information, so
955 // we can set undo_action to IGNORE if we know there is no value to
956 // restore.
957 undo_action = RESTORE;
958 ASSERT(store_position == -1);
959 ASSERT(!clear);
960 break;
961 }
962 case ActionNode::INCREMENT_REGISTER:
963 if (!absolute) {
964 value++;
965 }
966 ASSERT(store_position == -1);
967 ASSERT(!clear);
968 undo_action = RESTORE;
969 break;
970 case ActionNode::STORE_POSITION: {
971 Trace::DeferredCapture* pc =
972 static_cast<Trace::DeferredCapture*>(action);
973 if (!clear && store_position == -1) {
974 store_position = pc->cp_offset();
975 }
976
977 // For captures we know that stores and clears alternate.
978 // Other register, are never cleared, and if the occur
979 // inside a loop, they might be assigned more than once.
980 if (reg <= 1) {
981 // Registers zero and one, aka "capture zero", is
982 // always set correctly if we succeed. There is no
983 // need to undo a setting on backtrack, because we
984 // will set it again or fail.
985 undo_action = IGNORE;
986 } else {
987 undo_action = pc->is_capture() ? CLEAR : RESTORE;
988 }
989 ASSERT(!absolute);
990 ASSERT(value == 0);
991 break;
992 }
993 case ActionNode::CLEAR_CAPTURES: {
994 // Since we're scanning in reverse order, if we've already
995 // set the position we have to ignore historically earlier
996 // clearing operations.
997 if (store_position == -1) {
998 clear = true;
999 }
1000 undo_action = RESTORE;
1001 ASSERT(!absolute);
1002 ASSERT(value == 0);
1003 break;
1004 }
1005 default:
1006 UNREACHABLE();
1007 break;
1008 }
1009 }
1010 }
1011 // Prepare for the undo-action (e.g., push if it's going to be popped).
1012 if (undo_action == RESTORE) {
1013 assembler->PushRegister(reg);
1014 registers_to_pop->Set(reg, isolate);
1015 } else if (undo_action == CLEAR) {
1016 registers_to_clear->Set(reg, isolate);
1017 }
1018 // Perform the chronologically last action (or accumulated increment)
1019 // for the register.
1020 if (store_position != -1) {
1021 assembler->WriteCurrentPositionToRegister(reg, store_position);
1022 } else if (clear) {
1023 assembler->ClearRegisters(reg, reg);
1024 } else if (absolute) {
1025 assembler->SetRegister(reg, value);
1026 } else if (value != 0) {
1027 assembler->AdvanceRegister(reg, value);
1028 }
1029 }
1030 }
1031
1032
1033 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1034 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1035
1036 ASSERT(!is_trivial());
1037
1038 if (actions_ == NULL && backtrack() == NULL) {
1039 // Here we just have some deferred cp advances to fix and we are back to
1040 // a normal situation. We may also have to forget some information gained
1041 // through a quick check that was already performed.
1042 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1043 // Create a new trivial state and generate the node with that.
1044 Trace new_state;
1045 successor->Emit(compiler, &new_state);
1046 return;
1047 }
1048
1049 // Generate deferred actions here along with code to undo them again.
1050 OutSet affected_registers;
1051
1052 if (backtrack() != NULL) {
1053 // Here we have a concrete backtrack location. These are set up by choice
1054 // nodes and so they indicate that we have a deferred save of the current
1055 // position which we may need to emit here.
1056 assembler->PushCurrentPosition();
1057 }
1058
1059 intptr_t max_register = FindAffectedRegisters(&affected_registers, CI);
1060 OutSet registers_to_pop;
1061 OutSet registers_to_clear;
1062 PerformDeferredActions(assembler,
1063 max_register,
1064 affected_registers,
1065 &registers_to_pop,
1066 &registers_to_clear,
1067 CI);
1068 if (cp_offset_ != 0) {
1069 assembler->AdvanceCurrentPosition(cp_offset_);
1070 }
1071
1072 // Create a new trivial state and generate the node with that.
1073 BlockLabel undo;
1074 assembler->PushBacktrack(&undo);
1075 Trace new_state;
1076 successor->Emit(compiler, &new_state);
1077
1078 // On backtrack we need to restore state.
1079 assembler->BindBlock(&undo);
1080 RestoreAffectedRegisters(assembler,
1081 max_register,
1082 registers_to_pop,
1083 registers_to_clear);
1084 if (backtrack() == NULL) {
1085 assembler->Backtrack();
1086 } else {
1087 assembler->PopCurrentPosition();
1088 assembler->GoTo(backtrack());
1089 }
1090 }
1091
1092
1093 void Trace::InvalidateCurrentCharacter() {
1094 characters_preloaded_ = 0;
1095 }
1096
1097
1098 void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
1099 RegExpCompiler* compiler) {
1100 ASSERT(by > 0);
1101 // We don't have an instruction for shifting the current character register
1102 // down or for using a shifted value for anything so lets just forget that
1103 // we preloaded any characters into it.
1104 characters_preloaded_ = 0;
1105 // Adjust the offsets of the quick check performed information. This
1106 // information is used to find out what we already determined about the
1107 // characters by means of mask and compare.
1108 quick_check_performed_.Advance(by, compiler->ascii());
1109 cp_offset_ += by;
1110 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
1111 compiler->SetRegExpTooBig();
1112 cp_offset_ = 0;
1113 }
1114 bound_checked_up_to_ = Utils::Maximum(static_cast<intptr_t>(0),
1115 bound_checked_up_to_ - by);
1116 }
1117
1118
1119 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1120 if (!trace->is_trivial()) {
1121 trace->Flush(compiler, this);
1122 return;
1123 }
1124 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1125 if (!label()->IsBound()) {
1126 assembler->BindBlock(label());
1127 }
1128 switch (action_) {
1129 case ACCEPT:
1130 assembler->Succeed();
1131 return;
1132 case BACKTRACK:
1133 assembler->GoTo(trace->backtrack());
1134 return;
1135 case NEGATIVE_SUBMATCH_SUCCESS:
1136 // This case is handled in a different virtual method.
1137 UNREACHABLE();
1138 }
1139 UNIMPLEMENTED();
1140 }
1141
1142
1143 bool QuickCheckDetails::Rationalize(bool asc) {
1144 bool found_useful_op = false;
1145 uint32_t char_mask;
1146 if (asc) {
1147 char_mask = Symbols::kMaxOneCharCodeSymbol;
1148 } else {
1149 char_mask = Utf16::kMaxCodeUnit;
1150 }
1151 mask_ = 0;
1152 value_ = 0;
1153 intptr_t char_shift = 0;
1154 for (intptr_t i = 0; i < characters_; i++) {
1155 Position* pos = &positions_[i];
1156 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1157 found_useful_op = true;
1158 }
1159 mask_ |= (pos->mask & char_mask) << char_shift;
1160 value_ |= (pos->value & char_mask) << char_shift;
1161 char_shift += asc ? 8 : 16;
1162 }
1163 return found_useful_op;
1164 }
1165
1166
1167 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1168 Trace* trace,
1169 bool preload_has_checked_bounds,
1170 BlockLabel* on_possible_success,
1171 QuickCheckDetails* details,
1172 bool fall_through_on_failure) {
1173 if (details->characters() == 0) return false;
1174 GetQuickCheckDetails(
1175 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
1176 if (details->cannot_match()) return false;
1177 if (!details->Rationalize(compiler->ascii())) return false;
1178 ASSERT(details->characters() == 1 ||
1179 compiler->macro_assembler()->CanReadUnaligned());
1180 uint32_t mask = details->mask();
1181 uint32_t value = details->value();
1182
1183 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1184
1185 if (trace->characters_preloaded() != details->characters()) {
1186 assembler->LoadCurrentCharacter(trace->cp_offset(),
1187 trace->backtrack(),
1188 !preload_has_checked_bounds,
1189 details->characters());
1190 }
1191
1192
1193 bool need_mask = true;
1194
1195 if (details->characters() == 1) {
1196 // If number of characters preloaded is 1 then we used a byte or 16 bit
1197 // load so the value is already masked down.
1198 uint32_t char_mask;
1199 if (compiler->ascii()) {
1200 char_mask = Symbols::kMaxOneCharCodeSymbol;
1201 } else {
1202 char_mask = Utf16::kMaxCodeUnit;
1203 }
1204 if ((mask & char_mask) == char_mask) need_mask = false;
1205 mask &= char_mask;
1206 } else {
1207 // For 2-character preloads in ASCII mode or 1-character preloads in
1208 // TWO_BYTE mode we also use a 16 bit load with zero extend.
1209 if (details->characters() == 2 && compiler->ascii()) {
1210 if ((mask & 0xffff) == 0xffff) need_mask = false;
1211 } else if (details->characters() == 1 && !compiler->ascii()) {
1212 if ((mask & 0xffff) == 0xffff) need_mask = false;
1213 } else {
1214 if (mask == 0xffffffff) need_mask = false;
1215 }
1216 }
1217
1218 if (fall_through_on_failure) {
1219 if (need_mask) {
1220 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1221 } else {
1222 assembler->CheckCharacter(value, on_possible_success);
1223 }
1224 } else {
1225 if (need_mask) {
1226 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1227 } else {
1228 assembler->CheckNotCharacter(value, trace->backtrack());
1229 }
1230 }
1231 return true;
1232 }
1233
1234
1235 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
1236 // that matches newline or the start of input).
1237 static void EmitHat(RegExpCompiler* compiler,
1238 RegExpNode* on_success,
1239 Trace* trace) {
1240 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1241 // We will be loading the previous character into the current character
1242 // register.
1243 Trace new_trace(*trace);
1244 new_trace.InvalidateCurrentCharacter();
1245
1246 BlockLabel ok;
1247 if (new_trace.cp_offset() == 0) {
1248 // The start of input counts as a newline in this context, so skip to
1249 // ok if we are at the start.
1250 assembler->CheckAtStart(&ok);
1251 }
1252 // We already checked that we are not at the start of input so it must be
1253 // OK to load the previous character.
1254 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
1255 new_trace.backtrack(),
1256 false);
1257 if (!assembler->CheckSpecialCharacterClass('n',
1258 new_trace.backtrack())) {
1259 // Newline means \n, \r, 0x2028 or 0x2029.
1260 if (!compiler->ascii()) {
1261 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
1262 }
1263 assembler->CheckCharacter('\n', &ok);
1264 assembler->CheckNotCharacter('\r', new_trace.backtrack());
1265 }
1266 assembler->BindBlock(&ok);
1267 on_success->Emit(compiler, &new_trace);
1268 }
1269
1270
1271 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1272 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1273 switch (assertion_type_) {
1274 case AT_END: {
1275 BlockLabel ok;
1276 assembler->CheckPosition(trace->cp_offset(), &ok);
1277 assembler->GoTo(trace->backtrack());
1278 assembler->BindBlock(&ok);
1279 break;
1280 }
1281 case AT_START: {
1282 if (trace->at_start() == Trace::FALSE_VALUE) {
1283 assembler->GoTo(trace->backtrack());
1284 return;
1285 }
1286 if (trace->at_start() == Trace::UNKNOWN) {
1287 assembler->CheckNotAtStart(trace->backtrack());
1288 Trace at_start_trace = *trace;
1289 at_start_trace.set_at_start(true);
1290 on_success()->Emit(compiler, &at_start_trace);
1291 return;
1292 }
1293 }
1294 break;
1295 case AFTER_NEWLINE:
1296 EmitHat(compiler, on_success(), trace);
1297 return;
1298 case AT_BOUNDARY:
1299 case AT_NON_BOUNDARY: {
1300 EmitBoundaryCheck(compiler, trace);
1301 return;
1302 }
1303 }
1304 on_success()->Emit(compiler, trace);
1305 }
1306
1307
1308 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1309 Trace* trace) {
1310 // If we are generating a greedy loop then don't stop and don't reuse code.
1311 if (trace->stop_node() != NULL) {
1312 return CONTINUE;
1313 }
1314
1315 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1316 if (trace->is_trivial()) {
1317 if (label_.IsBound()) {
1318 // We are being asked to generate a generic version, but that's already
1319 // been done so just go to it.
1320 macro_assembler->GoTo(&label_);
1321 return DONE;
1322 }
1323 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1324 // To avoid too deep recursion we push the node to the work queue and just
1325 // generate a goto here.
1326 compiler->AddWork(this);
1327 macro_assembler->GoTo(&label_);
1328 return DONE;
1329 }
1330 // Generate generic version of the node and bind the label for later use.
1331 macro_assembler->BindBlock(&label_);
1332 return CONTINUE;
1333 }
1334
1335 // We are being asked to make a non-generic version. Keep track of how many
1336 // non-generic versions we generate so as not to overdo it.
1337 trace_count_++;
1338 if (kRegexpOptimization &&
1339 trace_count_ < kMaxCopiesCodeGenerated &&
1340 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1341 return CONTINUE;
1342 }
1343
1344 // If we get here code has been generated for this node too many times or
1345 // recursion is too deep. Time to switch to a generic version. The code for
1346 // generic versions above can handle deep recursion properly.
1347 trace->Flush(compiler, this);
1348 return DONE;
1349 }
1350
1351
1352 // This generates the code to match a text node. A text node can contain
1353 // straight character sequences (possibly to be matched in a case-independent
1354 // way) and character classes. For efficiency we do not do this in a single
1355 // pass from left to right. Instead we pass over the text node several times,
1356 // emitting code for some character positions every time. See the comment on
1357 // TextEmitPass for details.
1358 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1359 LimitResult limit_result = LimitVersions(compiler, trace);
1360 if (limit_result == DONE) return;
1361 ASSERT(limit_result == CONTINUE);
1362
1363 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
1364 compiler->SetRegExpTooBig();
1365 return;
1366 }
1367
1368 if (compiler->ascii()) {
1369 intptr_t dummy = 0;
1370 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
1371 }
1372
1373 bool first_elt_done = false;
1374 intptr_t bound_checked_to = trace->cp_offset() - 1;
1375 bound_checked_to += trace->bound_checked_up_to();
1376
1377 // If a character is preloaded into the current character register then
1378 // check that now.
1379 if (trace->characters_preloaded() == 1) {
1380 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
1381 if (!SkipPass(pass, compiler->ignore_case())) {
1382 TextEmitPass(compiler,
1383 static_cast<TextEmitPassType>(pass),
1384 true,
1385 trace,
1386 false,
1387 &bound_checked_to);
1388 }
1389 }
1390 first_elt_done = true;
1391 }
1392
1393 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
1394 if (!SkipPass(pass, compiler->ignore_case())) {
1395 TextEmitPass(compiler,
1396 static_cast<TextEmitPassType>(pass),
1397 false,
1398 trace,
1399 first_elt_done,
1400 &bound_checked_to);
1401 }
1402 }
1403
1404 Trace successor_trace(*trace);
1405 successor_trace.set_at_start(false);
1406 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
1407 RecursionCheck rc(compiler);
1408 on_success()->Emit(compiler, &successor_trace);
1409 }
1410
1411
1412 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1413 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1414 if (trace->stop_node() == this) {
1415 intptr_t text_length =
1416 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));
1417 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
1418 // Update the counter-based backtracking info on the stack. This is an
1419 // optimization for greedy loops (see below).
1420 ASSERT(trace->cp_offset() == text_length);
1421 macro_assembler->AdvanceCurrentPosition(text_length);
1422 macro_assembler->GoTo(trace->loop_label());
1423 return;
1424 }
1425 ASSERT(trace->stop_node() == NULL);
1426 if (!trace->is_trivial()) {
1427 trace->Flush(compiler, this);
1428 return;
1429 }
1430 ChoiceNode::Emit(compiler, trace);
1431 }
1432
1433
1434 // This class is used when generating the alternatives in a choice node. It
1435 // records the way the alternative is being code generated.
1436 struct AlternativeGeneration {
1437 AlternativeGeneration()
1438 : possible_success(),
1439 expects_preload(false),
1440 after(),
1441 quick_check_details() { }
1442 BlockLabel possible_success;
1443 bool expects_preload;
1444 BlockLabel after;
1445 QuickCheckDetails quick_check_details;
1446 };
1447
1448
1449 // Creates a list of AlternativeGenerations. If the list has a reasonable
1450 // size then it is on the stack, otherwise the excess is on the heap.
1451 class AlternativeGenerationList {
1452 public:
1453 explicit AlternativeGenerationList(intptr_t count)
1454 : alt_gens_(count) {
1455 for (intptr_t i = 0; i < count && i < kAFew; i++) {
1456 alt_gens_.Add(a_few_alt_gens_ + i);
1457 }
1458 for (intptr_t i = kAFew; i < count; i++) {
1459 alt_gens_.Add(new AlternativeGeneration());
1460 }
1461 }
1462 ~AlternativeGenerationList() {
1463 for (intptr_t i = kAFew; i < alt_gens_.length(); i++) {
1464 delete alt_gens_[i];
1465 alt_gens_[i] = NULL;
1466 }
1467 }
1468
1469 AlternativeGeneration* at(intptr_t i) {
1470 return alt_gens_[i];
1471 }
1472
1473 private:
1474 static const intptr_t kAFew = 10;
1475 GrowableArray<AlternativeGeneration*> alt_gens_;
1476 AlternativeGeneration a_few_alt_gens_[kAFew];
1477
1478 DISALLOW_ALLOCATION();
1479 };
1480
1481
1482 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1483 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1484 intptr_t choice_count = alternatives_->length();
1485
1486 #ifdef DEBUG
1487 for (intptr_t i = 0; i < choice_count - 1; i++) {
1488 GuardedAlternative alternative = alternatives_->At(i);
1489 ZoneGrowableArray<Guard*>* guards = alternative.guards();
1490 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
1491 for (intptr_t j = 0; j < guard_count; j++) {
1492 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
1493 }
1494 }
1495 #endif
1496
1497 LimitResult limit_result = LimitVersions(compiler, trace);
1498 if (limit_result == DONE) return;
1499 ASSERT(limit_result == CONTINUE);
1500
1501 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
1502 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
1503 trace->Flush(compiler, this);
1504 return;
1505 }
1506
1507 RecursionCheck rc(compiler);
1508
1509 Trace* current_trace = trace;
1510
1511 intptr_t text_length =
1512 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));
1513 bool greedy_loop = false;
1514 BlockLabel greedy_loop_label;
1515 Trace counter_backtrack_trace;
1516 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
1517 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
1518
1519 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
1520 // Here we have special handling for greedy loops containing only text nodes
1521 // and other simple nodes. These are handled by pushing the current
1522 // position on the stack and then incrementing the current position each
1523 // time around the switch. On backtrack we decrement the current position
1524 // and check it against the pushed value. This avoids pushing backtrack
1525 // information for each iteration of the loop, which could take up a lot of
1526 // space.
1527 greedy_loop = true;
1528 ASSERT(trace->stop_node() == NULL);
1529 macro_assembler->PushCurrentPosition();
1530 current_trace = &counter_backtrack_trace;
1531 BlockLabel greedy_match_failed;
1532 Trace greedy_match_trace;
1533 if (not_at_start()) greedy_match_trace.set_at_start(false);
1534 greedy_match_trace.set_backtrack(&greedy_match_failed);
1535 BlockLabel loop_label;
1536 macro_assembler->BindBlock(&loop_label);
1537 greedy_match_trace.set_stop_node(this);
1538 greedy_match_trace.set_loop_label(&loop_label);
1539 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);
1540 macro_assembler->BindBlock(&greedy_match_failed);
1541 }
1542
1543 BlockLabel second_choice; // For use in greedy matches.
1544 macro_assembler->BindBlock(&second_choice);
1545
1546 intptr_t first_normal_choice = greedy_loop ? 1 : 0;
1547
1548 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
1549 const intptr_t kEatsAtLeastNotYetInitialized = -1;
1550 intptr_t eats_at_least = kEatsAtLeastNotYetInitialized;
1551
1552 bool skip_was_emitted = false;
1553
1554 if (!greedy_loop && choice_count == 2) {
1555 GuardedAlternative alt1 = (*alternatives_)[1];
1556 if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
1557 RegExpNode* eats_anything_node = alt1.node();
1558 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
1559 this) {
1560 // At this point we know that we are at a non-greedy loop that will eat
1561 // any character one at a time. Any non-anchored regexp has such a
1562 // loop prepended to it in order to find where it starts. We look for
1563 // a pattern of the form ...abc... where we can look 6 characters ahead
1564 // and step forwards 3 if the character is not one of abc. Abc need
1565 // not be atoms, they can be any reasonably limited character class or
1566 // small alternation.
1567 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
1568 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
1569 if (lookahead == NULL) {
1570 eats_at_least = Utils::Minimum(kMaxLookaheadForBoyerMoore,
1571 EatsAtLeast(kMaxLookaheadForBoyerMoore,
1572 kRecursionBudget,
1573 not_at_start));
1574 if (eats_at_least >= 1) {
1575 BoyerMooreLookahead* bm =
1576 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);
1577 GuardedAlternative alt0 = alternatives_->At(0);
1578 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
1579 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
1580 }
1581 } else {
1582 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
1583 }
1584 }
1585 }
1586 }
1587
1588 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
1589 // Save some time by looking at most one machine word ahead.
1590 eats_at_least =
1591 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);
1592 }
1593 intptr_t preload_characters =
1594 CalculatePreloadCharacters(compiler, eats_at_least);
1595
1596 bool preload_is_current = !skip_was_emitted &&
1597 (current_trace->characters_preloaded() == preload_characters);
1598 bool preload_has_checked_bounds = preload_is_current;
1599
1600 AlternativeGenerationList alt_gens(choice_count);
1601
1602 // For now we just call all choices one after the other. The idea ultimately
1603 // is to use the Dispatch table to try only the relevant ones.
1604 for (intptr_t i = first_normal_choice; i < choice_count; i++) {
1605 GuardedAlternative alternative = alternatives_->At(i);
1606 AlternativeGeneration* alt_gen = alt_gens.at(i);
1607 alt_gen->quick_check_details.set_characters(preload_characters);
1608 ZoneGrowableArray<Guard*>* guards = alternative.guards();
1609 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
1610 Trace new_trace(*current_trace);
1611 new_trace.set_characters_preloaded(preload_is_current ?
1612 preload_characters :
1613 0);
1614 if (preload_has_checked_bounds) {
1615 new_trace.set_bound_checked_up_to(preload_characters);
1616 }
1617 new_trace.quick_check_performed()->Clear();
1618 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
1619 alt_gen->expects_preload = preload_is_current;
1620 bool generate_full_check_inline = false;
1621 if (kRegexpOptimization &&
1622 try_to_emit_quick_check_for_alternative(i) &&
1623 alternative.node()->EmitQuickCheck(compiler,
1624 &new_trace,
1625 preload_has_checked_bounds,
1626 &alt_gen->possible_success,
1627 &alt_gen->quick_check_details,
1628 i < choice_count - 1)) {
1629 // Quick check was generated for this choice.
1630 preload_is_current = true;
1631 preload_has_checked_bounds = true;
1632 // On the last choice in the ChoiceNode we generated the quick
1633 // check to fall through on possible success. So now we need to
1634 // generate the full check inline.
1635 if (i == choice_count - 1) {
1636 macro_assembler->BindBlock(&alt_gen->possible_success);
1637 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
1638 new_trace.set_characters_preloaded(preload_characters);
1639 new_trace.set_bound_checked_up_to(preload_characters);
1640 generate_full_check_inline = true;
1641 }
1642 } else if (alt_gen->quick_check_details.cannot_match()) {
1643 if (i == choice_count - 1 && !greedy_loop) {
1644 macro_assembler->GoTo(trace->backtrack());
1645 }
1646 continue;
1647 } else {
1648 // No quick check was generated. Put the full code here.
1649 // If this is not the first choice then there could be slow checks from
1650 // previous cases that go here when they fail. There's no reason to
1651 // insist that they preload characters since the slow check we are about
1652 // to generate probably can't use it.
1653 if (i != first_normal_choice) {
1654 alt_gen->expects_preload = false;
1655 new_trace.InvalidateCurrentCharacter();
1656 }
1657 if (i < choice_count - 1) {
1658 new_trace.set_backtrack(&alt_gen->after);
1659 }
1660 generate_full_check_inline = true;
1661 }
1662 if (generate_full_check_inline) {
1663 if (new_trace.actions() != NULL) {
1664 new_trace.set_flush_budget(new_flush_budget);
1665 }
1666 for (intptr_t j = 0; j < guard_count; j++) {
1667 GenerateGuard(macro_assembler, guards->At(j), &new_trace);
1668 }
1669 alternative.node()->Emit(compiler, &new_trace);
1670 preload_is_current = false;
1671 }
1672 macro_assembler->BindBlock(&alt_gen->after);
1673 }
1674 if (greedy_loop) {
1675 macro_assembler->BindBlock(&greedy_loop_label);
1676 // If we have unwound to the bottom then backtrack.
1677 macro_assembler->CheckGreedyLoop(trace->backtrack());
1678 // Otherwise try the second priority at an earlier position.
1679 macro_assembler->AdvanceCurrentPosition(-text_length);
1680 macro_assembler->GoTo(&second_choice);
1681 }
1682
1683 // At this point we need to generate slow checks for the alternatives where
1684 // the quick check was inlined. We can recognize these because the associated
1685 // label was bound.
1686 for (intptr_t i = first_normal_choice; i < choice_count - 1; i++) {
1687 AlternativeGeneration* alt_gen = alt_gens.at(i);
1688 Trace new_trace(*current_trace);
1689 // If there are actions to be flushed we have to limit how many times
1690 // they are flushed. Take the budget of the parent trace and distribute
1691 // it fairly amongst the children.
1692 if (new_trace.actions() != NULL) {
1693 new_trace.set_flush_budget(new_flush_budget);
1694 }
1695 EmitOutOfLineContinuation(compiler,
1696 &new_trace,
1697 alternatives_->At(i),
1698 alt_gen,
1699 preload_characters,
1700 alt_gens.at(i + 1)->expects_preload);
1701 }
1702 }
1703
1704
1705 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
1706 Trace* trace,
1707 GuardedAlternative alternative,
1708 AlternativeGeneration* alt_gen,
1709 intptr_t preload_characters,
1710 bool next_expects_preload) {
1711 if (!alt_gen->possible_success.IsLinked()) return;
1712
1713 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1714 macro_assembler->BindBlock(&alt_gen->possible_success);
1715 Trace out_of_line_trace(*trace);
1716 out_of_line_trace.set_characters_preloaded(preload_characters);
1717 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
1718 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
1719 ZoneGrowableArray<Guard*>* guards = alternative.guards();
1720 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
1721 if (next_expects_preload) {
1722 BlockLabel reload_current_char;
1723 out_of_line_trace.set_backtrack(&reload_current_char);
1724 for (intptr_t j = 0; j < guard_count; j++) {
1725 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
1726 }
1727 alternative.node()->Emit(compiler, &out_of_line_trace);
1728 macro_assembler->BindBlock(&reload_current_char);
1729 // Reload the current character, since the next quick check expects that.
1730 // We don't need to check bounds here because we only get into this
1731 // code through a quick check which already did the checked load.
1732 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
1733 NULL,
1734 false,
1735 preload_characters);
1736 macro_assembler->GoTo(&(alt_gen->after));
1737 } else {
1738 out_of_line_trace.set_backtrack(&(alt_gen->after));
1739 for (intptr_t j = 0; j < guard_count; j++) {
1740 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
1741 }
1742 alternative.node()->Emit(compiler, &out_of_line_trace);
1743 }
1744 }
1745
1746
1747 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1748 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1749 LimitResult limit_result = LimitVersions(compiler, trace);
1750 if (limit_result == DONE) return;
1751 ASSERT(limit_result == CONTINUE);
1752
1753 RecursionCheck rc(compiler);
1754
1755 switch (action_type_) {
1756 case STORE_POSITION: {
1757 Trace::DeferredCapture
1758 new_capture(data_.u_position_register.reg,
1759 data_.u_position_register.is_capture,
1760 trace);
1761 Trace new_trace = *trace;
1762 new_trace.add_action(&new_capture);
1763 on_success()->Emit(compiler, &new_trace);
1764 break;
1765 }
1766 case INCREMENT_REGISTER: {
1767 Trace::DeferredIncrementRegister
1768 new_increment(data_.u_increment_register.reg);
1769 Trace new_trace = *trace;
1770 new_trace.add_action(&new_increment);
1771 on_success()->Emit(compiler, &new_trace);
1772 break;
1773 }
1774 case SET_REGISTER: {
1775 Trace::DeferredSetRegister
1776 new_set(data_.u_store_register.reg, data_.u_store_register.value);
1777 Trace new_trace = *trace;
1778 new_trace.add_action(&new_set);
1779 on_success()->Emit(compiler, &new_trace);
1780 break;
1781 }
1782 case CLEAR_CAPTURES: {
1783 Trace::DeferredClearCaptures
1784 new_capture(Interval(data_.u_clear_captures.range_from,
1785 data_.u_clear_captures.range_to));
1786 Trace new_trace = *trace;
1787 new_trace.add_action(&new_capture);
1788 on_success()->Emit(compiler, &new_trace);
1789 break;
1790 }
1791 case BEGIN_SUBMATCH:
1792 if (!trace->is_trivial()) {
1793 trace->Flush(compiler, this);
1794 } else {
1795 assembler->WriteCurrentPositionToRegister(
1796 data_.u_submatch.current_position_register, 0);
1797 assembler->WriteStackPointerToRegister(
1798 data_.u_submatch.stack_pointer_register);
1799 on_success()->Emit(compiler, trace);
1800 }
1801 break;
1802 case EMPTY_MATCH_CHECK: {
1803 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
1804 intptr_t stored_pos = 0;
1805 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
1806 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
1807 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
1808 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
1809 // If we know we haven't advanced and there is no minimum we
1810 // can just backtrack immediately.
1811 assembler->GoTo(trace->backtrack());
1812 } else if (know_dist && stored_pos < trace->cp_offset()) {
1813 // If we know we've advanced we can generate the continuation
1814 // immediately.
1815 on_success()->Emit(compiler, trace);
1816 } else if (!trace->is_trivial()) {
1817 trace->Flush(compiler, this);
1818 } else {
1819 BlockLabel skip_empty_check;
1820 // If we have a minimum number of repetitions we check the current
1821 // number first and skip the empty check if it's not enough.
1822 if (has_minimum) {
1823 intptr_t limit = data_.u_empty_match_check.repetition_limit;
1824 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
1825 }
1826 // If the match is empty we bail out, otherwise we fall through
1827 // to the on-success continuation.
1828 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
1829 trace->backtrack());
1830 assembler->BindBlock(&skip_empty_check);
1831 on_success()->Emit(compiler, trace);
1832 }
1833 break;
1834 }
1835 case POSITIVE_SUBMATCH_SUCCESS: {
1836 if (!trace->is_trivial()) {
1837 trace->Flush(compiler, this);
1838 return;
1839 }
1840 assembler->ReadCurrentPositionFromRegister(
1841 data_.u_submatch.current_position_register);
1842 assembler->ReadStackPointerFromRegister(
1843 data_.u_submatch.stack_pointer_register);
1844 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
1845 if (clear_register_count == 0) {
1846 on_success()->Emit(compiler, trace);
1847 return;
1848 }
1849 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
1850 BlockLabel clear_registers_backtrack;
1851 Trace new_trace = *trace;
1852 new_trace.set_backtrack(&clear_registers_backtrack);
1853 on_success()->Emit(compiler, &new_trace);
1854
1855 assembler->BindBlock(&clear_registers_backtrack);
1856 intptr_t clear_registers_to =
1857 clear_registers_from + clear_register_count - 1;
1858 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
1859
1860 ASSERT(trace->backtrack() == NULL);
1861 assembler->Backtrack();
1862 return;
1863 }
1864 default:
1865 UNREACHABLE();
1866 }
1867 }
1868
1869
1870 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1871 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1872 if (!trace->is_trivial()) {
1873 trace->Flush(compiler, this);
1874 return;
1875 }
1876
1877 LimitResult limit_result = LimitVersions(compiler, trace);
1878 if (limit_result == DONE) return;
1879 ASSERT(limit_result == CONTINUE);
1880
1881 RecursionCheck rc(compiler);
1882
1883 ASSERT(start_reg_ + 1 == end_reg_);
1884 if (compiler->ignore_case()) {
1885 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
1886 trace->backtrack());
1887 } else {
1888 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
1889 }
1890 on_success()->Emit(compiler, trace);
1891 }
1892
1893
1894 void ActionNode::FillInBMInfo(intptr_t offset,
1895 intptr_t budget,
1896 BoyerMooreLookahead* bm,
1897 bool not_at_start) {
1898 if (action_type_ == BEGIN_SUBMATCH) {
1899 bm->SetRest(offset);
1900 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1901 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1902 }
1903 SaveBMInfo(bm, not_at_start, offset);
1904 }
1905
1906
1907 void AssertionNode::FillInBMInfo(intptr_t offset,
1908 intptr_t budget,
1909 BoyerMooreLookahead* bm,
1910 bool not_at_start) {
1911 // Match the behaviour of EatsAtLeast on this node.
1912 if (assertion_type() == AT_START && not_at_start) return;
1913 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1914 SaveBMInfo(bm, not_at_start, offset);
1915 }
1916
1917
1918 void BackReferenceNode::FillInBMInfo(intptr_t offset,
1919 intptr_t budget,
1920 BoyerMooreLookahead* bm,
1921 bool not_at_start) {
1922 // Working out the set of characters that a backreference can match is too
1923 // hard, so we just say that any character can match.
1924 bm->SetRest(offset);
1925 SaveBMInfo(bm, not_at_start, offset);
1926 }
1927
1928
1929 // Returns the number of characters in the equivalence class, omitting those
1930 // that cannot occur in the source string because it is ASCII.
1931 static intptr_t GetCaseIndependentLetters(uint16_t character,
1932 bool ascii_subject,
1933 int32_t* letters) {
1934 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
1935 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);
1936 // Unibrow returns 0 or 1 for characters where case independence is
1937 // trivial.
1938 if (length == 0) {
1939 letters[0] = character;
1940 length = 1;
1941 }
1942 if (!ascii_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
1943 return length;
1944 }
1945 // The standard requires that non-ASCII characters cannot have ASCII
1946 // character codes in their equivalence class.
1947 return 0;
1948 }
1949
1950
1951 void ChoiceNode::FillInBMInfo(intptr_t offset,
1952 intptr_t budget,
1953 BoyerMooreLookahead* bm,
1954 bool not_at_start) {
1955 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
1956 budget = (budget - 1) / alts->length();
1957 for (intptr_t i = 0; i < alts->length(); i++) {
1958 GuardedAlternative& alt = (*alts)[i];
1959 if (alt.guards() != NULL && alt.guards()->length() != 0) {
1960 bm->SetRest(offset); // Give up trying to fill in info.
1961 SaveBMInfo(bm, not_at_start, offset);
1962 return;
1963 }
1964 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
1965 }
1966 SaveBMInfo(bm, not_at_start, offset);
1967 }
1968
1969
1970 void EndNode::FillInBMInfo(intptr_t offset,
1971 intptr_t budget,
1972 BoyerMooreLookahead* bm,
1973 bool not_at_start) {
1974 // Returning 0 from EatsAtLeast should ensure we never get here.
1975 UNREACHABLE();
1976 }
1977
1978
1979 void LoopChoiceNode::FillInBMInfo(intptr_t offset,
1980 intptr_t budget,
1981 BoyerMooreLookahead* bm,
1982 bool not_at_start) {
1983 if (body_can_be_zero_length_ || budget <= 0) {
1984 bm->SetRest(offset);
1985 SaveBMInfo(bm, not_at_start, offset);
1986 return;
1987 }
1988 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
1989 SaveBMInfo(bm, not_at_start, offset);
1990 }
1991
1992
1993 void TextNode::FillInBMInfo(intptr_t initial_offset,
1994 intptr_t budget,
1995 BoyerMooreLookahead* bm,
1996 bool not_at_start) {
1997 if (initial_offset >= bm->length()) return;
1998 intptr_t offset = initial_offset;
1999 intptr_t max_char = bm->max_char();
2000 for (intptr_t i = 0; i < elements()->length(); i++) {
2001 if (offset >= bm->length()) {
2002 if (initial_offset == 0) set_bm_info(not_at_start, bm);
2003 return;
2004 }
2005 TextElement text = elements()->At(i);
2006 if (text.text_type() == TextElement::ATOM) {
2007 RegExpAtom* atom = text.atom();
2008 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
2009 if (offset >= bm->length()) {
2010 if (initial_offset == 0) set_bm_info(not_at_start, bm);
2011 return;
2012 }
2013 uint16_t character = atom->data()->At(j);
2014 if (bm->compiler()->ignore_case()) {
2015 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2016 intptr_t length = GetCaseIndependentLetters(
2017 character,
2018 bm->max_char() == Symbols::kMaxOneCharCodeSymbol,
2019 chars);
2020 for (intptr_t j = 0; j < length; j++) {
2021 bm->Set(offset, chars[j]);
2022 }
2023 } else {
2024 if (character <= max_char) bm->Set(offset, character);
2025 }
2026 }
2027 } else {
2028 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
2029 RegExpCharacterClass* char_class = text.char_class();
2030 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
2031 if (char_class->is_negated()) {
2032 bm->SetAll(offset);
2033 } else {
2034 for (intptr_t k = 0; k < ranges->length(); k++) {
2035 CharacterRange& range = (*ranges)[k];
2036 if (range.from() > max_char) continue;
2037 intptr_t to = Utils::Minimum(max_char,
2038 static_cast<intptr_t>(range.to()));
2039 bm->SetInterval(offset, Interval(range.from(), to));
2040 }
2041 }
2042 offset++;
2043 }
2044 }
2045 if (offset >= bm->length()) {
2046 if (initial_offset == 0) set_bm_info(not_at_start, bm);
2047 return;
2048 }
2049 on_success()->FillInBMInfo(offset,
2050 budget - 1,
2051 bm,
2052 true); // Not at start after a text node.
2053 if (initial_offset == 0) set_bm_info(not_at_start, bm);
2054 }
2055
2056
2057 // Check for [0-9A-Z_a-z].
2058 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2059 BlockLabel* word,
2060 BlockLabel* non_word,
2061 bool fall_through_on_word) {
2062 if (assembler->CheckSpecialCharacterClass(
2063 fall_through_on_word ? 'w' : 'W',
2064 fall_through_on_word ? non_word : word)) {
2065 // Optimized implementation available.
2066 return;
2067 }
2068 assembler->CheckCharacterGT('z', non_word);
2069 assembler->CheckCharacterLT('0', non_word);
2070 assembler->CheckCharacterGT('a' - 1, word);
2071 assembler->CheckCharacterLT('9' + 1, word);
2072 assembler->CheckCharacterLT('A', non_word);
2073 assembler->CheckCharacterLT('Z' + 1, word);
2074 if (fall_through_on_word) {
2075 assembler->CheckNotCharacter('_', non_word);
2076 } else {
2077 assembler->CheckCharacter('_', word);
2078 }
2079 }
2080
2081
2082 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2083 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2084 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2085 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2086 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2087 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2088 if (lookahead == NULL) {
2089 intptr_t eats_at_least =
2090 Utils::Minimum(kMaxLookaheadForBoyerMoore,
2091 EatsAtLeast(kMaxLookaheadForBoyerMoore,
2092 kRecursionBudget,
2093 not_at_start));
2094 if (eats_at_least >= 1) {
2095 BoyerMooreLookahead* bm =
2096 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);
2097 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
2098 if (bm->at(0)->is_non_word())
2099 next_is_word_character = Trace::FALSE_VALUE;
2100 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2101 }
2102 } else {
2103 if (lookahead->at(0)->is_non_word())
2104 next_is_word_character = Trace::FALSE_VALUE;
2105 if (lookahead->at(0)->is_word())
2106 next_is_word_character = Trace::TRUE_VALUE;
2107 }
2108 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2109 if (next_is_word_character == Trace::UNKNOWN) {
2110 BlockLabel before_non_word;
2111 BlockLabel before_word;
2112 if (trace->characters_preloaded() != 1) {
2113 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2114 }
2115 // Fall through on non-word.
2116 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2117 // Next character is not a word character.
2118 assembler->BindBlock(&before_non_word);
2119 BlockLabel ok;
2120 // Backtrack on \B (non-boundary check) if previous is a word,
2121 // since we know next *is not* a word and this would be a boundary.
2122 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2123
2124 if (!assembler->IsClosed()) {
2125 assembler->GoTo(&ok);
2126 }
2127
2128 assembler->BindBlock(&before_word);
2129 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2130 assembler->BindBlock(&ok);
2131 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2132 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2133 } else {
2134 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2135 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2136 }
2137 }
2138
2139
2140 void AssertionNode::BacktrackIfPrevious(
2141 RegExpCompiler* compiler,
2142 Trace* trace,
2143 AssertionNode::IfPrevious backtrack_if_previous) {
2144 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2145 Trace new_trace(*trace);
2146 new_trace.InvalidateCurrentCharacter();
2147
2148 BlockLabel fall_through, dummy;
2149
2150 BlockLabel* non_word = backtrack_if_previous == kIsNonWord ?
2151 new_trace.backtrack() :
2152 &fall_through;
2153 BlockLabel* word = backtrack_if_previous == kIsNonWord ?
2154 &fall_through :
2155 new_trace.backtrack();
2156
2157 if (new_trace.cp_offset() == 0) {
2158 // The start of input counts as a non-word character, so the question is
2159 // decided if we are at the start.
2160 assembler->CheckAtStart(non_word);
2161 }
2162 // We already checked that we are not at the start of input so it must be
2163 // OK to load the previous character.
2164 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2165 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2166
2167 assembler->BindBlock(&fall_through);
2168 on_success()->Emit(compiler, &new_trace);
2169 }
2170
2171
2172 static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2173 if (quick_check == NULL) return false;
2174 if (offset >= quick_check->characters()) return false;
2175 return quick_check->positions(offset)->determines_perfectly;
2176 }
2177
2178
2179 static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2180 if (index > *checked_up_to) {
2181 *checked_up_to = index;
2182 }
2183 }
2184
2185
2186 typedef bool EmitCharacterFunction(Isolate* isolate,
2187 RegExpCompiler* compiler,
2188 uint16_t c,
2189 BlockLabel* on_failure,
2190 intptr_t cp_offset,
2191 bool check,
2192 bool preloaded);
2193
2194
2195 static inline bool EmitSimpleCharacter(Isolate* isolate,
2196 RegExpCompiler* compiler,
2197 uint16_t c,
2198 BlockLabel* on_failure,
2199 intptr_t cp_offset,
2200 bool check,
2201 bool preloaded) {
2202 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2203 bool bound_checked = false;
2204 if (!preloaded) {
2205 assembler->LoadCurrentCharacter(
2206 cp_offset,
2207 on_failure,
2208 check);
2209 bound_checked = true;
2210 }
2211 assembler->CheckNotCharacter(c, on_failure);
2212 return bound_checked;
2213 }
2214
2215
2216 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
2217 bool ascii,
2218 uint16_t c1,
2219 uint16_t c2,
2220 BlockLabel* on_failure) {
2221 uint16_t char_mask;
2222 if (ascii) {
2223 char_mask = Symbols::kMaxOneCharCodeSymbol;
2224 } else {
2225 char_mask = Utf16::kMaxCodeUnit;
2226 }
2227 uint16_t exor = c1 ^ c2;
2228 // Check whether exor has only one bit set.
2229 if (((exor - 1) & exor) == 0) {
2230 // If c1 and c2 differ only by one bit.
2231 // Ecma262UnCanonicalize always gives the highest number last.
2232 ASSERT(c2 > c1);
2233 uint16_t mask = char_mask ^ exor;
2234 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
2235 return true;
2236 }
2237 ASSERT(c2 > c1);
2238 uint16_t diff = c2 - c1;
2239 if (((diff - 1) & diff) == 0 && c1 >= diff) {
2240 // If the characters differ by 2^n but don't differ by one bit then
2241 // subtract the difference from the found character, then do the or
2242 // trick. We avoid the theoretical case where negative numbers are
2243 // involved in order to simplify code generation.
2244 uint16_t mask = char_mask ^ diff;
2245 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
2246 diff,
2247 mask,
2248 on_failure);
2249 return true;
2250 }
2251 return false;
2252 }
2253
2254 // Only emits letters (things that have case). Only used for case independent
2255 // matches.
2256 static inline bool EmitAtomLetter(Isolate* isolate,
2257 RegExpCompiler* compiler,
2258 uint16_t c,
2259 BlockLabel* on_failure,
2260 intptr_t cp_offset,
2261 bool check,
2262 bool preloaded) {
2263 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2264 bool ascii = compiler->ascii();
2265 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2266 intptr_t length = GetCaseIndependentLetters(c, ascii, chars);
2267 if (length <= 1) return false;
2268 // We may not need to check against the end of the input string
2269 // if this character lies before a character that matched.
2270 if (!preloaded) {
2271 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
2272 }
2273 BlockLabel ok;
2274 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
2275 switch (length) {
2276 case 2: {
2277 if (ShortCutEmitCharacterPair(macro_assembler,
2278 ascii,
2279 chars[0],
2280 chars[1],
2281 on_failure)) {
2282 } else {
2283 macro_assembler->CheckCharacter(chars[0], &ok);
2284 macro_assembler->CheckNotCharacter(chars[1], on_failure);
2285 macro_assembler->BindBlock(&ok);
2286 }
2287 break;
2288 }
2289 case 4:
2290 macro_assembler->CheckCharacter(chars[3], &ok);
2291 // Fall through!
2292 case 3:
2293 macro_assembler->CheckCharacter(chars[0], &ok);
2294 macro_assembler->CheckCharacter(chars[1], &ok);
2295 macro_assembler->CheckNotCharacter(chars[2], on_failure);
2296 macro_assembler->BindBlock(&ok);
2297 break;
2298 default:
2299 UNREACHABLE();
2300 break;
2301 }
2302 return true;
2303 }
2304
2305
2306 // Only emits non-letters (things that don't have case). Only used for case
2307 // independent matches.
2308 static inline bool EmitAtomNonLetter(Isolate* isolate,
2309 RegExpCompiler* compiler,
2310 uint16_t c,
2311 BlockLabel* on_failure,
2312 intptr_t cp_offset,
2313 bool check,
2314 bool preloaded) {
2315 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2316 bool ascii = compiler->ascii();
2317 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2318 intptr_t length = GetCaseIndependentLetters(c, ascii, chars);
2319 if (length < 1) {
2320 // This can't match. Must be an ASCII subject and a non-ASCII character.
2321 // We do not need to do anything since the ASCII pass already handled this.
2322 return false; // Bounds not checked.
2323 }
2324 bool checked = false;
2325 // We handle the length > 1 case in a later pass.
2326 if (length == 1) {
2327 if (ascii && c > Symbols::kMaxOneCharCodeSymbol) {
2328 // Can't match - see above.
2329 return false; // Bounds not checked.
2330 }
2331 if (!preloaded) {
2332 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
2333 checked = check;
2334 }
2335 macro_assembler->CheckNotCharacter(c, on_failure);
2336 }
2337 return checked;
2338 }
2339
2340
2341 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
2342 intptr_t border,
2343 BlockLabel* fall_through,
2344 BlockLabel* above_or_equal,
2345 BlockLabel* below) {
2346 if (below != fall_through) {
2347 masm->CheckCharacterLT(border, below);
2348 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
2349 } else {
2350 masm->CheckCharacterGT(border - 1, above_or_equal);
2351 }
2352 }
2353
2354
2355 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
2356 intptr_t first,
2357 intptr_t last,
2358 BlockLabel* fall_through,
2359 BlockLabel* in_range,
2360 BlockLabel* out_of_range) {
2361 if (in_range == fall_through) {
2362 if (first == last) {
2363 masm->CheckNotCharacter(first, out_of_range);
2364 } else {
2365 masm->CheckCharacterNotInRange(first, last, out_of_range);
2366 }
2367 } else {
2368 if (first == last) {
2369 masm->CheckCharacter(first, in_range);
2370 } else {
2371 masm->CheckCharacterInRange(first, last, in_range);
2372 }
2373 if (out_of_range != fall_through) masm->GoTo(out_of_range);
2374 }
2375 }
2376
2377
2378 static void CutOutRange(RegExpMacroAssembler* masm,
2379 ZoneGrowableArray<int>* ranges,
2380 intptr_t start_index,
2381 intptr_t end_index,
2382 intptr_t cut_index,
2383 BlockLabel* even_label,
2384 BlockLabel* odd_label) {
2385 bool odd = (((cut_index - start_index) & 1) == 1);
2386 BlockLabel* in_range_label = odd ? odd_label : even_label;
2387 BlockLabel dummy;
2388 EmitDoubleBoundaryTest(masm,
2389 ranges->At(cut_index),
2390 ranges->At(cut_index + 1) - 1,
2391 &dummy,
2392 in_range_label,
2393 &dummy);
2394 ASSERT(!dummy.IsLinked());
2395 // Cut out the single range by rewriting the array. This creates a new
2396 // range that is a merger of the two ranges on either side of the one we
2397 // are cutting out. The oddity of the labels is preserved.
2398 for (intptr_t j = cut_index; j > start_index; j--) {
2399 (*ranges)[j] = ranges->At(j - 1);
2400 }
2401 for (intptr_t j = cut_index + 1; j < end_index; j++) {
2402 (*ranges)[j] = ranges->At(j + 1);
2403 }
2404 }
2405
2406
2407 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
2408 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
2409 static void EmitUseLookupTable(
2410 RegExpMacroAssembler* masm,
2411 ZoneGrowableArray<int>* ranges,
2412 intptr_t start_index,
2413 intptr_t end_index,
2414 intptr_t min_char,
2415 BlockLabel* fall_through,
2416 BlockLabel* even_label,
2417 BlockLabel* odd_label) {
2418 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2419 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
2420
2421 intptr_t base = (min_char & ~kMask);
2422
2423 // Assert that everything is on one kTableSize page.
2424 for (intptr_t i = start_index; i <= end_index; i++) {
2425 ASSERT((ranges->At(i) & ~kMask) == base);
2426 }
2427 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
2428
2429 char templ[kSize];
2430 BlockLabel* on_bit_set;
2431 BlockLabel* on_bit_clear;
2432 intptr_t bit;
2433 if (even_label == fall_through) {
2434 on_bit_set = odd_label;
2435 on_bit_clear = even_label;
2436 bit = 1;
2437 } else {
2438 on_bit_set = even_label;
2439 on_bit_clear = odd_label;
2440 bit = 0;
2441 }
2442 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;
2443 i++) {
2444 templ[i] = bit;
2445 }
2446 intptr_t j = 0;
2447 bit ^= 1;
2448 for (intptr_t i = start_index; i < end_index; i++) {
2449 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {
2450 templ[j] = bit;
2451 }
2452 bit ^= 1;
2453 }
2454 for (intptr_t i = j; i < kSize; i++) {
2455 templ[i] = bit;
2456 }
2457 // TODO(erikcorry): Cache these.
2458 const TypedData& ba = TypedData::ZoneHandle(
2459 masm->isolate(),
2460 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
2461 for (intptr_t i = 0; i < kSize; i++) {
2462 ba.SetUint8(i, templ[i]);
2463 }
2464 masm->CheckBitInTable(ba, on_bit_set);
2465 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
2466 }
2467
2468
2469 // Unicode case. Split the search space into kSize spaces that are handled
2470 // with recursion.
2471 static void SplitSearchSpace(ZoneGrowableArray<int>* ranges,
2472 intptr_t start_index,
2473 intptr_t end_index,
2474 intptr_t* new_start_index,
2475 intptr_t* new_end_index,
2476 intptr_t* border) {
2477 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2478 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
2479
2480 intptr_t first = ranges->At(start_index);
2481 intptr_t last = ranges->At(end_index) - 1;
2482
2483 *new_start_index = start_index;
2484 *border = (ranges->At(start_index) & ~kMask) + kSize;
2485 while (*new_start_index < end_index) {
2486 if (ranges->At(*new_start_index) > *border) break;
2487 (*new_start_index)++;
2488 }
2489 // new_start_index is the index of the first edge that is beyond the
2490 // current kSize space.
2491
2492 // For very large search spaces we do a binary chop search of the non-ASCII
2493 // space instead of just going to the end of the current kSize space. The
2494 // heuristics are complicated a little by the fact that any 128-character
2495 // encoding space can be quickly tested with a table lookup, so we don't
2496 // wish to do binary chop search at a smaller granularity than that. A
2497 // 128-character space can take up a lot of space in the ranges array if,
2498 // for example, we only want to match every second character (eg. the lower
2499 // case characters on some Unicode pages).
2500 intptr_t binary_chop_index = (end_index + start_index) / 2;
2501 // The first test ensures that we get to the code that handles the ASCII
2502 // range with a single not-taken branch, speeding up this important
2503 // character range (even non-ASCII charset-based text has spaces and
2504 // punctuation).
2505 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // ASCII case.
2506 end_index - start_index > (*new_start_index - start_index) * 2 &&
2507 last - first > kSize * 2 &&
2508 binary_chop_index > *new_start_index &&
2509 ranges->At(binary_chop_index) >= first + 2 * kSize) {
2510 intptr_t scan_forward_for_section_border = binary_chop_index;;
2511 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1;
2512
2513 while (scan_forward_for_section_border < end_index) {
2514 if (ranges->At(scan_forward_for_section_border) > new_border) {
2515 *new_start_index = scan_forward_for_section_border;
2516 *border = new_border;
2517 break;
2518 }
2519 scan_forward_for_section_border++;
2520 }
2521 }
2522
2523 ASSERT(*new_start_index > start_index);
2524 *new_end_index = *new_start_index - 1;
2525 if (ranges->At(*new_end_index) == *border) {
2526 (*new_end_index)--;
2527 }
2528 if (*border >= ranges->At(end_index)) {
2529 *border = ranges->At(end_index);
2530 *new_start_index = end_index; // Won't be used.
2531 *new_end_index = end_index - 1;
2532 }
2533 }
2534
2535
2536 // Gets a series of segment boundaries representing a character class. If the
2537 // character is in the range between an even and an odd boundary (counting from
2538 // start_index) then go to even_label, otherwise go to odd_label. We already
2539 // know that the character is in the range of min_char to max_char inclusive.
2540 // Either label can be NULL indicating backtracking. Either label can also be
2541 // equal to the fall_through label.
2542 static void GenerateBranches(RegExpMacroAssembler* masm,
2543 ZoneGrowableArray<int>* ranges,
2544 intptr_t start_index,
2545 intptr_t end_index,
2546 uint16_t min_char,
2547 uint16_t max_char,
2548 BlockLabel* fall_through,
2549 BlockLabel* even_label,
2550 BlockLabel* odd_label) {
2551 intptr_t first = ranges->At(start_index);
2552 intptr_t last = ranges->At(end_index) - 1;
2553
2554 ASSERT(min_char < first);
2555
2556 // Just need to test if the character is before or on-or-after
2557 // a particular character.
2558 if (start_index == end_index) {
2559 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2560 return;
2561 }
2562
2563 // Another almost trivial case: There is one interval in the middle that is
2564 // different from the end intervals.
2565 if (start_index + 1 == end_index) {
2566 EmitDoubleBoundaryTest(
2567 masm, first, last, fall_through, even_label, odd_label);
2568 return;
2569 }
2570
2571 // It's not worth using table lookup if there are very few intervals in the
2572 // character class.
2573 if (end_index - start_index <= 6) {
2574 // It is faster to test for individual characters, so we look for those
2575 // first, then try arbitrary ranges in the second round.
2576 static intptr_t kNoCutIndex = -1;
2577 intptr_t cut = kNoCutIndex;
2578 for (intptr_t i = start_index; i < end_index; i++) {
2579 if (ranges->At(i) == ranges->At(i + 1) - 1) {
2580 cut = i;
2581 break;
2582 }
2583 }
2584 if (cut == kNoCutIndex) cut = start_index;
2585 CutOutRange(
2586 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2587 ASSERT(end_index - start_index >= 2);
2588 GenerateBranches(masm,
2589 ranges,
2590 start_index + 1,
2591 end_index - 1,
2592 min_char,
2593 max_char,
2594 fall_through,
2595 even_label,
2596 odd_label);
2597 return;
2598 }
2599
2600 // If there are a lot of intervals in the regexp, then we will use tables to
2601 // determine whether the character is inside or outside the character class.
2602 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;
2603
2604 if ((max_char >> kBits) == (min_char >> kBits)) {
2605 EmitUseLookupTable(masm,
2606 ranges,
2607 start_index,
2608 end_index,
2609 min_char,
2610 fall_through,
2611 even_label,
2612 odd_label);
2613 return;
2614 }
2615
2616 if ((min_char >> kBits) != (first >> kBits)) {
2617 masm->CheckCharacterLT(first, odd_label);
2618 GenerateBranches(masm,
2619 ranges,
2620 start_index + 1,
2621 end_index,
2622 first,
2623 max_char,
2624 fall_through,
2625 odd_label,
2626 even_label);
2627 return;
2628 }
2629
2630 intptr_t new_start_index = 0;
2631 intptr_t new_end_index = 0;
2632 intptr_t border = 0;
2633
2634 SplitSearchSpace(ranges,
2635 start_index,
2636 end_index,
2637 &new_start_index,
2638 &new_end_index,
2639 &border);
2640
2641 BlockLabel handle_rest;
2642 BlockLabel* above = &handle_rest;
2643 if (border == last + 1) {
2644 // We didn't find any section that started after the limit, so everything
2645 // above the border is one of the terminal labels.
2646 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2647 ASSERT(new_end_index == end_index - 1);
2648 }
2649
2650 ASSERT(start_index <= new_end_index);
2651 ASSERT(new_start_index <= end_index);
2652 ASSERT(start_index < new_start_index);
2653 ASSERT(new_end_index < end_index);
2654 ASSERT(new_end_index + 1 == new_start_index ||
2655 (new_end_index + 2 == new_start_index &&
2656 border == ranges->At(new_end_index + 1)));
2657 ASSERT(min_char < border - 1);
2658 ASSERT(border < max_char);
2659 ASSERT(ranges->At(new_end_index) < border);
2660 ASSERT(border < ranges->At(new_start_index) ||
2661 (border == ranges->At(new_start_index) &&
2662 new_start_index == end_index &&
2663 new_end_index == end_index - 1 &&
2664 border == last + 1));
2665 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
2666
2667 masm->CheckCharacterGT(border - 1, above);
2668 BlockLabel dummy;
2669 GenerateBranches(masm,
2670 ranges,
2671 start_index,
2672 new_end_index,
2673 min_char,
2674 border - 1,
2675 &dummy,
2676 even_label,
2677 odd_label);
2678
2679 if (handle_rest.IsLinked()) {
2680 masm->BindBlock(&handle_rest);
2681 bool flip = (new_start_index & 1) != (start_index & 1);
2682 GenerateBranches(masm,
2683 ranges,
2684 new_start_index,
2685 end_index,
2686 border,
2687 max_char,
2688 &dummy,
2689 flip ? odd_label : even_label,
2690 flip ? even_label : odd_label);
2691 }
2692 }
2693
2694
2695 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2696 RegExpCharacterClass* cc,
2697 bool ascii,
2698 BlockLabel* on_failure,
2699 intptr_t cp_offset,
2700 bool check_offset,
2701 bool preloaded,
2702 Isolate* isolate) {
2703 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2704 if (!CharacterRange::IsCanonical(ranges)) {
2705 CharacterRange::Canonicalize(ranges);
2706 }
2707
2708 intptr_t max_char;
2709 if (ascii) {
2710 max_char = Symbols::kMaxOneCharCodeSymbol;
2711 } else {
2712 max_char = Utf16::kMaxCodeUnit;
2713 }
2714
2715 intptr_t range_count = ranges->length();
2716
2717 intptr_t last_valid_range = range_count - 1;
2718 while (last_valid_range >= 0) {
2719 CharacterRange& range = (*ranges)[last_valid_range];
2720 if (range.from() <= max_char) {
2721 break;
2722 }
2723 last_valid_range--;
2724 }
2725
2726 if (last_valid_range < 0) {
2727 if (!cc->is_negated()) {
2728 macro_assembler->GoTo(on_failure);
2729 }
2730 if (check_offset) {
2731 macro_assembler->CheckPosition(cp_offset, on_failure);
2732 }
2733 return;
2734 }
2735
2736 if (last_valid_range == 0 &&
2737 ranges->At(0).IsEverything(max_char)) {
2738 if (cc->is_negated()) {
2739 macro_assembler->GoTo(on_failure);
2740 } else {
2741 // This is a common case hit by non-anchored expressions.
2742 if (check_offset) {
2743 macro_assembler->CheckPosition(cp_offset, on_failure);
2744 }
2745 }
2746 return;
2747 }
2748 if (last_valid_range == 0 &&
2749 !cc->is_negated() &&
2750 ranges->At(0).IsEverything(max_char)) {
2751 // This is a common case hit by non-anchored expressions.
2752 if (check_offset) {
2753 macro_assembler->CheckPosition(cp_offset, on_failure);
2754 }
2755 return;
2756 }
2757
2758 if (!preloaded) {
2759 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2760 }
2761
2762 if (cc->is_standard() &&
2763 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2764 on_failure)) {
2765 return;
2766 }
2767
2768
2769 // A new list with ascending entries. Each entry is a code unit
2770 // where there is a boundary between code units that are part of
2771 // the class and code units that are not. Normally we insert an
2772 // entry at zero which goes to the failure label, but if there
2773 // was already one there we fall through for success on that entry.
2774 // Subsequent entries have alternating meaning (success/failure).
2775 ZoneGrowableArray<int>* range_boundaries =
2776 new(isolate) ZoneGrowableArray<int>(last_valid_range);
2777
2778 bool zeroth_entry_is_failure = !cc->is_negated();
2779
2780 for (intptr_t i = 0; i <= last_valid_range; i++) {
2781 CharacterRange& range = (*ranges)[i];
2782 if (range.from() == 0) {
2783 ASSERT(i == 0);
2784 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2785 } else {
2786 range_boundaries->Add(range.from());
2787 }
2788 range_boundaries->Add(range.to() + 1);
2789 }
2790 intptr_t end_index = range_boundaries->length() - 1;
2791 if (range_boundaries->At(end_index) > max_char) {
2792 end_index--;
2793 }
2794
2795 BlockLabel fall_through;
2796 GenerateBranches(macro_assembler,
2797 range_boundaries,
2798 0, // start_index.
2799 end_index,
2800 0, // min_char.
2801 max_char,
2802 &fall_through,
2803 zeroth_entry_is_failure ? &fall_through : on_failure,
2804 zeroth_entry_is_failure ? on_failure : &fall_through);
2805 macro_assembler->BindBlock(&fall_through);
2806 }
2807
2808
2809 // We call this repeatedly to generate code for each pass over the text node.
2810 // The passes are in increasing order of difficulty because we hope one
2811 // of the first passes will fail in which case we are saved the work of the
2812 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2813 // we will check the '%' in the first pass, the case independent 'a' in the
2814 // second pass and the character class in the last pass.
2815 //
2816 // The passes are done from right to left, so for example to test for /bar/
2817 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2818 // and then a 'b' with offset 0. This means we can avoid the end-of-input
2819 // bounds check most of the time. In the example we only need to check for
2820 // end-of-input when loading the putative 'r'.
2821 //
2822 // A slight complication involves the fact that the first character may already
2823 // be fetched into a register by the previous node. In this case we want to
2824 // do the test for that character first. We do this in separate passes. The
2825 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2826 // pass has been performed then subsequent passes will have true in
2827 // first_element_checked to indicate that that character does not need to be
2828 // checked again.
2829 //
2830 // In addition to all this we are passed a Trace, which can
2831 // contain an AlternativeGeneration object. In this AlternativeGeneration
2832 // object we can see details of any quick check that was already passed in
2833 // order to get to the code we are now generating. The quick check can involve
2834 // loading characters, which means we do not need to recheck the bounds
2835 // up to the limit the quick check already checked. In addition the quick
2836 // check can have involved a mask and compare operation which may simplify
2837 // or obviate the need for further checks at some character positions.
2838 void TextNode::TextEmitPass(RegExpCompiler* compiler,
2839 TextEmitPassType pass,
2840 bool preloaded,
2841 Trace* trace,
2842 bool first_element_checked,
2843 intptr_t* checked_up_to) {
2844 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2845 bool ascii = compiler->ascii();
2846 BlockLabel* backtrack = trace->backtrack();
2847 QuickCheckDetails* quick_check = trace->quick_check_performed();
2848 intptr_t element_count = elms_->length();
2849 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2850 TextElement elm = elms_->At(i);
2851 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset();
2852 if (elm.text_type() == TextElement::ATOM) {
2853 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2854 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2855 if (first_element_checked && i == 0 && j == 0) continue;
2856 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2857 EmitCharacterFunction* emit_function = NULL;
2858 switch (pass) {
2859 case NON_ASCII_MATCH:
2860 ASSERT(ascii);
2861 if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) {
2862 assembler->GoTo(backtrack);
2863 return;
2864 }
2865 break;
2866 case NON_LETTER_CHARACTER_MATCH:
2867 emit_function = &EmitAtomNonLetter;
2868 break;
2869 case SIMPLE_CHARACTER_MATCH:
2870 emit_function = &EmitSimpleCharacter;
2871 break;
2872 case CASE_CHARACTER_MATCH:
2873 emit_function = &EmitAtomLetter;
2874 break;
2875 default:
2876 break;
2877 }
2878 if (emit_function != NULL) {
2879 bool bound_checked = emit_function(I,
2880 compiler,
2881 quarks->At(j),
2882 backtrack,
2883 cp_offset + j,
2884 *checked_up_to < cp_offset + j,
2885 preloaded);
2886 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2887 }
2888 }
2889 } else {
2890 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2891 if (pass == CHARACTER_CLASS_MATCH) {
2892 if (first_element_checked && i == 0) continue;
2893 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2894 RegExpCharacterClass* cc = elm.char_class();
2895 EmitCharClass(assembler,
2896 cc,
2897 ascii,
2898 backtrack,
2899 cp_offset,
2900 *checked_up_to < cp_offset,
2901 preloaded,
2902 I);
2903 UpdateBoundsCheck(cp_offset, checked_up_to);
2904 }
2905 }
2906 }
2907 }
2908
2909
2910 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2911 intptr_t eats_at_least) {
2912 intptr_t preload_characters = Utils::Minimum(static_cast<intptr_t>(4),
2913 eats_at_least);
2914 if (compiler->macro_assembler()->CanReadUnaligned()) {
2915 bool ascii = compiler->ascii();
2916 if (ascii) {
2917 if (preload_characters > 4) preload_characters = 4;
2918 // We can't preload 3 characters because there is no machine instruction
2919 // to do that. We can't just load 4 because we could be reading
2920 // beyond the end of the string, which could cause a memory fault.
2921 if (preload_characters == 3) preload_characters = 2;
2922 } else {
2923 if (preload_characters > 2) preload_characters = 2;
2924 }
2925 } else {
2926 if (preload_characters > 1) preload_characters = 1;
2927 }
2928 return preload_characters;
2929 }
2930
2931
2932 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
2933 intptr_t budget,
2934 bool not_at_start) {
2935 intptr_t answer = Length();
2936 if (answer >= still_to_find) return answer;
2937 if (budget <= 0) return answer;
2938 // We are not at start after this node so we set the last argument to 'true'.
2939 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2940 budget - 1,
2941 true);
2942 }
2943
2944
2945 intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
2946 intptr_t budget,
2947 bool not_at_start) {
2948 if (budget <= 0) return 0;
2949 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2950 return on_success()->EatsAtLeast(still_to_find,
2951 budget - 1,
2952 not_at_start);
2953 }
2954
2955
2956 intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
2957 intptr_t budget,
2958 bool not_at_start) {
2959 if (budget <= 0) return 0;
2960 // If we know we are not at the start and we are asked "how many characters
2961 // will you match if you succeed?" then we can answer anything since false
2962 // implies false. So lets just return the max answer (still_to_find) since
2963 // that won't prevent us from preloading a lot of characters for the other
2964 // branches in the node graph.
2965 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2966 return on_success()->EatsAtLeast(still_to_find,
2967 budget - 1,
2968 not_at_start);
2969 }
2970
2971
2972 intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
2973 intptr_t budget,
2974 bool not_at_start) {
2975 if (budget <= 0) return 0;
2976 return on_success()->EatsAtLeast(still_to_find,
2977 budget - 1,
2978 not_at_start);
2979 }
2980
2981
2982 intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
2983 intptr_t budget,
2984 RegExpNode* ignore_this_node,
2985 bool not_at_start) {
2986 if (budget <= 0) return 0;
2987 intptr_t min = 100;
2988 intptr_t choice_count = alternatives_->length();
2989 budget = (budget - 1) / choice_count;
2990 for (intptr_t i = 0; i < choice_count; i++) {
2991 RegExpNode* node = (*alternatives_)[i].node();
2992 if (node == ignore_this_node) continue;
2993 intptr_t node_eats_at_least =
2994 node->EatsAtLeast(still_to_find, budget, not_at_start);
2995 if (node_eats_at_least < min) min = node_eats_at_least;
2996 if (min == 0) return 0;
2997 }
2998 return min;
2999 }
3000
3001
3002 intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
3003 intptr_t budget,
3004 bool not_at_start) {
3005 return EatsAtLeastHelper(still_to_find,
3006 budget,
3007 NULL,
3008 not_at_start);
3009 }
3010
3011
3012 intptr_t NegativeLookaheadChoiceNode::EatsAtLeast(intptr_t still_to_find,
3013 intptr_t budget,
3014 bool not_at_start) {
3015 if (budget <= 0) return 0;
3016 // Alternative 0 is the negative lookahead, alternative 1 is what comes
3017 // afterwards.
3018 RegExpNode* node = (*alternatives_)[1].node();
3019 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
3020 }
3021
3022
3023 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
3024 static inline uint32_t SmearBitsRight(uint32_t v) {
3025 v |= v >> 1;
3026 v |= v >> 2;
3027 v |= v >> 4;
3028 v |= v >> 8;
3029 v |= v >> 16;
3030 return v;
3031 }
3032
3033
3034 // Here is the meat of GetQuickCheckDetails (see also the comment on the
3035 // super-class in the .h file).
3036 //
3037 // We iterate along the text object, building up for each character a
3038 // mask and value that can be used to test for a quick failure to match.
3039 // The masks and values for the positions will be combined into a single
3040 // machine word for the current character width in order to be used in
3041 // generating a quick check.
3042 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
3043 RegExpCompiler* compiler,
3044 intptr_t characters_filled_in,
3045 bool not_at_start) {
3046 ASSERT(characters_filled_in < details->characters());
3047 intptr_t characters = details->characters();
3048 intptr_t char_mask;
3049 if (compiler->ascii()) {
3050 char_mask = Symbols::kMaxOneCharCodeSymbol;
3051 } else {
3052 char_mask = Utf16::kMaxCodeUnit;
3053 }
3054 for (intptr_t k = 0; k < elms_->length(); k++) {
3055 TextElement elm = elms_->At(k);
3056 if (elm.text_type() == TextElement::ATOM) {
3057 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
3058 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
3059 QuickCheckDetails::Position* pos =
3060 details->positions(characters_filled_in);
3061 uint16_t c = quarks->At(i);
3062 if (c > char_mask) {
3063 // If we expect a non-ASCII character from an ASCII string,
3064 // there is no way we can match. Not even case independent
3065 // matching can turn an ASCII character into non-ASCII or
3066 // vice versa.
3067 details->set_cannot_match();
3068 pos->determines_perfectly = false;
3069 return;
3070 }
3071 if (compiler->ignore_case()) {
3072 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3073 intptr_t length =
3074 GetCaseIndependentLetters(c, compiler->ascii(), chars);
3075 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
3076 if (length == 1) {
3077 // This letter has no case equivalents, so it's nice and simple
3078 // and the mask-compare will determine definitely whether we have
3079 // a match at this character position.
3080 pos->mask = char_mask;
3081 pos->value = c;
3082 pos->determines_perfectly = true;
3083 } else {
3084 uint32_t common_bits = char_mask;
3085 uint32_t bits = chars[0];
3086 for (intptr_t j = 1; j < length; j++) {
3087 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
3088 common_bits ^= differing_bits;
3089 bits &= common_bits;
3090 }
3091 // If length is 2 and common bits has only one zero in it then
3092 // our mask and compare instruction will determine definitely
3093 // whether we have a match at this character position. Otherwise
3094 // it can only be an approximate check.
3095 uint32_t one_zero = (common_bits | ~char_mask);
3096 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
3097 pos->determines_perfectly = true;
3098 }
3099 pos->mask = common_bits;
3100 pos->value = bits;
3101 }
3102 } else {
3103 // Don't ignore case. Nice simple case where the mask-compare will
3104 // determine definitely whether we have a match at this character
3105 // position.
3106 pos->mask = char_mask;
3107 pos->value = c;
3108 pos->determines_perfectly = true;
3109 }
3110 characters_filled_in++;
3111 ASSERT(characters_filled_in <= details->characters());
3112 if (characters_filled_in == details->characters()) {
3113 return;
3114 }
3115 }
3116 } else {
3117 QuickCheckDetails::Position* pos =
3118 details->positions(characters_filled_in);
3119 RegExpCharacterClass* tree = elm.char_class();
3120 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
3121 if (tree->is_negated()) {
3122 // A quick check uses multi-character mask and compare. There is no
3123 // useful way to incorporate a negative char class into this scheme
3124 // so we just conservatively create a mask and value that will always
3125 // succeed.
3126 pos->mask = 0;
3127 pos->value = 0;
3128 } else {
3129 intptr_t first_range = 0;
3130 while (ranges->At(first_range).from() > char_mask) {
3131 first_range++;
3132 if (first_range == ranges->length()) {
3133 details->set_cannot_match();
3134 pos->determines_perfectly = false;
3135 return;
3136 }
3137 }
3138 CharacterRange range = ranges->At(first_range);
3139 uint16_t from = range.from();
3140 uint16_t to = range.to();
3141 if (to > char_mask) {
3142 to = char_mask;
3143 }
3144 uint32_t differing_bits = (from ^ to);
3145 // A mask and compare is only perfect if the differing bits form a
3146 // number like 00011111 with one single block of trailing 1s.
3147 if ((differing_bits & (differing_bits + 1)) == 0 &&
3148 from + differing_bits == to) {
3149 pos->determines_perfectly = true;
3150 }
3151 uint32_t common_bits = ~SmearBitsRight(differing_bits);
3152 uint32_t bits = (from & common_bits);
3153 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
3154 CharacterRange range = ranges->At(i);
3155 uint16_t from = range.from();
3156 uint16_t to = range.to();
3157 if (from > char_mask) continue;
3158 if (to > char_mask) to = char_mask;
3159 // Here we are combining more ranges into the mask and compare
3160 // value. With each new range the mask becomes more sparse and
3161 // so the chances of a false positive rise. A character class
3162 // with multiple ranges is assumed never to be equivalent to a
3163 // mask and compare operation.
3164 pos->determines_perfectly = false;
3165 uint32_t new_common_bits = (from ^ to);
3166 new_common_bits = ~SmearBitsRight(new_common_bits);
3167 common_bits &= new_common_bits;
3168 bits &= new_common_bits;
3169 uint32_t differing_bits = (from & common_bits) ^ bits;
3170 common_bits ^= differing_bits;
3171 bits &= common_bits;
3172 }
3173 pos->mask = common_bits;
3174 pos->value = bits;
3175 }
3176 characters_filled_in++;
3177 ASSERT(characters_filled_in <= details->characters());
3178 if (characters_filled_in == details->characters()) {
3179 return;
3180 }
3181 }
3182 }
3183 ASSERT(characters_filled_in != details->characters());
3184 if (!details->cannot_match()) {
3185 on_success()-> GetQuickCheckDetails(details,
3186 compiler,
3187 characters_filled_in,
3188 true);
3189 }
3190 }
3191
3192
3193 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
3194 RegExpCompiler* compiler,
3195 intptr_t characters_filled_in,
3196 bool not_at_start) {
3197 if (body_can_be_zero_length_ || info()->visited) return;
3198 VisitMarker marker(info());
3199 return ChoiceNode::GetQuickCheckDetails(details,
3200 compiler,
3201 characters_filled_in,
3202 not_at_start);
3203 }
3204
3205
3206 intptr_t TextNode::Length() {
3207 TextElement elm = elms_->Last();
3208 ASSERT(elm.cp_offset() >= 0);
3209 return elm.cp_offset() + elm.length();
3210 }
3211
3212
3213 bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
3214 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
3215 if (ignore_case) {
3216 return pass == SIMPLE_CHARACTER_MATCH;
3217 } else {
3218 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3219 }
3220 }
3221
3222
3223 static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,
3224 const intptr_t* special_class,
3225 intptr_t length) {
3226 length--; // Remove final 0x10000.
3227 ASSERT(special_class[length] == 0x10000);
3228 ASSERT(ranges->length() != 0);
3229 ASSERT(length != 0);
3230 ASSERT(special_class[0] != 0);
3231 if (ranges->length() != (length >> 1) + 1) {
3232 return false;
3233 }
3234 CharacterRange range = ranges->At(0);
3235 if (range.from() != 0) {
3236 return false;
3237 }
3238 for (intptr_t i = 0; i < length; i += 2) {
3239 if (special_class[i] != (range.to() + 1)) {
3240 return false;
3241 }
3242 range = ranges->At((i >> 1) + 1);
3243 if (special_class[i+1] != range.from()) {
3244 return false;
3245 }
3246 }
3247 if (range.to() != 0xffff) {
3248 return false;
3249 }
3250 return true;
3251 }
3252
3253
3254 static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges,
3255 const intptr_t* special_class,
3256 intptr_t length) {
3257 length--; // Remove final 0x10000.
3258 ASSERT(special_class[length] == 0x10000);
3259 if (ranges->length() * 2 != length) {
3260 return false;
3261 }
3262 for (intptr_t i = 0; i < length; i += 2) {
3263 CharacterRange range = ranges->At(i >> 1);
3264 if (range.from() != special_class[i] ||
3265 range.to() != special_class[i + 1] - 1) {
3266 return false;
3267 }
3268 }
3269 return true;
3270 }
3271
3272
3273 bool RegExpCharacterClass::is_standard() {
3274 // TODO(lrn): Remove need for this function, by not throwing away information
3275 // along the way.
3276 if (is_negated_) {
3277 return false;
3278 }
3279 if (set_.is_standard()) {
3280 return true;
3281 }
3282 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3283 set_.set_standard_set_type('s');
3284 return true;
3285 }
3286 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3287 set_.set_standard_set_type('S');
3288 return true;
3289 }
3290 if (CompareInverseRanges(set_.ranges(),
3291 kLineTerminatorRanges,
3292 kLineTerminatorRangeCount)) {
3293 set_.set_standard_set_type('.');
3294 return true;
3295 }
3296 if (CompareRanges(set_.ranges(),
3297 kLineTerminatorRanges,
3298 kLineTerminatorRangeCount)) {
3299 set_.set_standard_set_type('n');
3300 return true;
3301 }
3302 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3303 set_.set_standard_set_type('w');
3304 return true;
3305 }
3306 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3307 set_.set_standard_set_type('W');
3308 return true;
3309 }
3310 return false;
3311 }
3312
3313
3314 void TextNode::MakeCaseIndependent(bool is_ascii) {
3315 intptr_t element_count = elms_->length();
3316 for (intptr_t i = 0; i < element_count; i++) {
3317 TextElement elm = elms_->At(i);
3318 if (elm.text_type() == TextElement::CHAR_CLASS) {
3319 RegExpCharacterClass* cc = elm.char_class();
3320 // None of the standard character classes is different in the case
3321 // independent case and it slows us down if we don't know that.
3322 if (cc->is_standard()) continue;
3323 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
3324 intptr_t range_count = ranges->length();
3325 for (intptr_t j = 0; j < range_count; j++) {
3326 (*ranges)[j].AddCaseEquivalents(ranges, is_ascii, I);
3327 }
3328 }
3329 }
3330 }
3331
3332
3333 intptr_t TextNode::GreedyLoopTextLength() {
3334 TextElement elm = elms_->At(elms_->length() - 1);
3335 return elm.cp_offset() + elm.length();
3336 }
3337
3338
3339 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3340 RegExpCompiler* compiler) {
3341 if (elms_->length() != 1) return NULL;
3342 TextElement elm = elms_->At(0);
3343 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3344 RegExpCharacterClass* node = elm.char_class();
3345 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
3346 if (!CharacterRange::IsCanonical(ranges)) {
3347 CharacterRange::Canonicalize(ranges);
3348 }
3349 if (node->is_negated()) {
3350 return ranges->length() == 0 ? on_success() : NULL;
3351 }
3352 if (ranges->length() != 1) return NULL;
3353 uint32_t max_char;
3354 if (compiler->ascii()) {
3355 max_char = Symbols::kMaxOneCharCodeSymbol;
3356 } else {
3357 max_char = Utf16::kMaxCodeUnit;
3358 }
3359 return ranges->At(0).IsEverything(max_char) ? on_success() : NULL;
3360 }
3361
3362
3363 ActionNode* ActionNode::SetRegister(intptr_t reg,
3364 intptr_t val,
3365 RegExpNode* on_success) {
3366 ActionNode* result =
3367 new(on_success->isolate()) ActionNode(SET_REGISTER, on_success);
3368 result->data_.u_store_register.reg = reg;
3369 result->data_.u_store_register.value = val;
3370 return result;
3371 }
3372
3373
3374 ActionNode* ActionNode::IncrementRegister(intptr_t reg,
3375 RegExpNode* on_success) {
3376 ActionNode* result =
3377 new(on_success->isolate()) ActionNode(INCREMENT_REGISTER, on_success);
3378 result->data_.u_increment_register.reg = reg;
3379 return result;
3380 }
3381
3382
3383 ActionNode* ActionNode::StorePosition(intptr_t reg,
3384 bool is_capture,
3385 RegExpNode* on_success) {
3386 ActionNode* result =
3387 new(on_success->isolate()) ActionNode(STORE_POSITION, on_success);
3388 result->data_.u_position_register.reg = reg;
3389 result->data_.u_position_register.is_capture = is_capture;
3390 return result;
3391 }
3392
3393
3394 ActionNode* ActionNode::ClearCaptures(Interval range,
3395 RegExpNode* on_success) {
3396 ActionNode* result =
3397 new(on_success->isolate()) ActionNode(CLEAR_CAPTURES, on_success);
3398 result->data_.u_clear_captures.range_from = range.from();
3399 result->data_.u_clear_captures.range_to = range.to();
3400 return result;
3401 }
3402
3403
3404 ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg,
3405 intptr_t position_reg,
3406 RegExpNode* on_success) {
3407 ActionNode* result =
3408 new(on_success->isolate()) ActionNode(BEGIN_SUBMATCH, on_success);
3409 result->data_.u_submatch.stack_pointer_register = stack_reg;
3410 result->data_.u_submatch.current_position_register = position_reg;
3411 return result;
3412 }
3413
3414
3415 ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg,
3416 intptr_t position_reg,
3417 intptr_t clear_register_count,
3418 intptr_t clear_register_from,
3419 RegExpNode* on_success) {
3420 ActionNode* result =
3421 new(on_success->isolate()) ActionNode(POSITIVE_SUBMATCH_SUCCESS,
3422 on_success);
3423 result->data_.u_submatch.stack_pointer_register = stack_reg;
3424 result->data_.u_submatch.current_position_register = position_reg;
3425 result->data_.u_submatch.clear_register_count = clear_register_count;
3426 result->data_.u_submatch.clear_register_from = clear_register_from;
3427 return result;
3428 }
3429
3430
3431 ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register,
3432 intptr_t repetition_register,
3433 intptr_t repetition_limit,
3434 RegExpNode* on_success) {
3435 ActionNode* result =
3436 new(on_success->isolate()) ActionNode(EMPTY_MATCH_CHECK, on_success);
3437 result->data_.u_empty_match_check.start_register = start_register;
3438 result->data_.u_empty_match_check.repetition_register = repetition_register;
3439 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
3440 return result;
3441 }
3442
3443
3444 TextElement TextElement::Atom(RegExpAtom* atom) {
3445 return TextElement(ATOM, atom);
3446 }
3447
3448
3449 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
3450 return TextElement(CHAR_CLASS, char_class);
3451 }
3452
3453
3454 intptr_t TextElement::length() const {
3455 switch (text_type()) {
3456 case ATOM:
3457 return atom()->length();
3458
3459 case CHAR_CLASS:
3460 return 1;
3461 }
3462 UNREACHABLE();
3463 return 0;
3464 }
3465
3466
3467 static void AddClass(const intptr_t* elmv,
3468 intptr_t elmc,
3469 ZoneGrowableArray<CharacterRange>* ranges) {
3470 elmc--;
3471 ASSERT(elmv[elmc] == 0x10000);
3472 for (intptr_t i = 0; i < elmc; i += 2) {
3473 ASSERT(elmv[i] < elmv[i + 1]);
3474 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
3475 }
3476 }
3477
3478
3479 static void AddClassNegated(const intptr_t *elmv,
3480 intptr_t elmc,
3481 ZoneGrowableArray<CharacterRange>* ranges) {
3482 elmc--;
3483 ASSERT(elmv[elmc] == 0x10000);
3484 ASSERT(elmv[0] != 0x0000);
3485 ASSERT(elmv[elmc-1] != Utf16::kMaxCodeUnit);
3486 uint16_t last = 0x0000;
3487 for (intptr_t i = 0; i < elmc; i += 2) {
3488 ASSERT(last <= elmv[i] - 1);
3489 ASSERT(elmv[i] < elmv[i + 1]);
3490 ranges->Add(CharacterRange(last, elmv[i] - 1));
3491 last = elmv[i + 1];
3492 }
3493 ranges->Add(CharacterRange(last, Utf16::kMaxCodeUnit));
3494 }
3495
3496
3497 void CharacterRange::AddClassEscape(uint16_t type,
3498 ZoneGrowableArray<CharacterRange>* ranges) {
3499 switch (type) {
3500 case 's':
3501 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3502 break;
3503 case 'S':
3504 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3505 break;
3506 case 'w':
3507 AddClass(kWordRanges, kWordRangeCount, ranges);
3508 break;
3509 case 'W':
3510 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3511 break;
3512 case 'd':
3513 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3514 break;
3515 case 'D':
3516 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3517 break;
3518 case '.':
3519 AddClassNegated(kLineTerminatorRanges,
3520 kLineTerminatorRangeCount,
3521 ranges);
3522 break;
3523 // This is not a character range as defined by the spec but a
3524 // convenient shorthand for a character class that matches any
3525 // character.
3526 case '*':
3527 ranges->Add(CharacterRange::Everything());
3528 break;
3529 // This is the set of characters matched by the $ and ^ symbols
3530 // in multiline mode.
3531 case 'n':
3532 AddClass(kLineTerminatorRanges,
3533 kLineTerminatorRangeCount,
3534 ranges);
3535 break;
3536 default:
3537 UNREACHABLE();
3538 }
3539 }
3540
3541
3542 // Move a number of elements in a zonelist to another position
3543 // in the same list. Handles overlapping source and target areas.
3544 static void MoveRanges(ZoneGrowableArray<CharacterRange>* list,
3545 intptr_t from,
3546 intptr_t to,
3547 intptr_t count) {
3548 // Ranges are potentially overlapping.
3549 if (from < to) {
3550 for (intptr_t i = count - 1; i >= 0; i--) {
3551 (*list)[to + i] = list->At(from + i);
3552 }
3553 } else {
3554 for (intptr_t i = 0; i < count; i++) {
3555 (*list)[to + i] = list->At(from + i);
3556 }
3557 }
3558 }
3559
3560
3561 static intptr_t InsertRangeInCanonicalList(
3562 ZoneGrowableArray<CharacterRange>* list,
3563 intptr_t count,
3564 CharacterRange insert) {
3565 // Inserts a range into list[0..count[, which must be sorted
3566 // by from value and non-overlapping and non-adjacent, using at most
3567 // list[0..count] for the result. Returns the number of resulting
3568 // canonicalized ranges. Inserting a range may collapse existing ranges into
3569 // fewer ranges, so the return value can be anything in the range 1..count+1.
3570 uint16_t from = insert.from();
3571 uint16_t to = insert.to();
3572 intptr_t start_pos = 0;
3573 intptr_t end_pos = count;
3574 for (intptr_t i = count - 1; i >= 0; i--) {
3575 CharacterRange current = list->At(i);
3576 if (current.from() > to + 1) {
3577 end_pos = i;
3578 } else if (current.to() + 1 < from) {
3579 start_pos = i + 1;
3580 break;
3581 }
3582 }
3583
3584 // Inserted range overlaps, or is adjacent to, ranges at positions
3585 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
3586 // not affected by the insertion.
3587 // If start_pos == end_pos, the range must be inserted before start_pos.
3588 // if start_pos < end_pos, the entire range from start_pos to end_pos
3589 // must be merged with the insert range.
3590
3591 if (start_pos == end_pos) {
3592 // Insert between existing ranges at position start_pos.
3593 if (start_pos < count) {
3594 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
3595 }
3596 (*list)[start_pos] = insert;
3597 return count + 1;
3598 }
3599 if (start_pos + 1 == end_pos) {
3600 // Replace single existing range at position start_pos.
3601 CharacterRange to_replace = list->At(start_pos);
3602 intptr_t new_from = Utils::Minimum(to_replace.from(), from);
3603 intptr_t new_to = Utils::Maximum(to_replace.to(), to);
3604 (*list)[start_pos] = CharacterRange(new_from, new_to);
3605 return count;
3606 }
3607 // Replace a number of existing ranges from start_pos to end_pos - 1.
3608 // Move the remaining ranges down.
3609
3610 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);
3611 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to);
3612 if (end_pos < count) {
3613 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
3614 }
3615 (*list)[start_pos] = CharacterRange(new_from, new_to);
3616 return count - (end_pos - start_pos) + 1;
3617 }
3618
3619
3620 bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) {
3621 ASSERT(ranges != NULL);
3622 intptr_t n = ranges->length();
3623 if (n <= 1) return true;
3624 intptr_t max = ranges->At(0).to();
3625 for (intptr_t i = 1; i < n; i++) {
3626 CharacterRange next_range = ranges->At(i);
3627 if (next_range.from() <= max + 1) return false;
3628 max = next_range.to();
3629 }
3630 return true;
3631 }
3632
3633
3634 void CharacterRange::Canonicalize(
3635 ZoneGrowableArray<CharacterRange>* character_ranges) {
3636 if (character_ranges->length() <= 1) return;
3637 // Check whether ranges are already canonical (increasing, non-overlapping,
3638 // non-adjacent).
3639 intptr_t n = character_ranges->length();
3640 intptr_t max = character_ranges->At(0).to();
3641 intptr_t i = 1;
3642 while (i < n) {
3643 CharacterRange current = character_ranges->At(i);
3644 if (current.from() <= max + 1) {
3645 break;
3646 }
3647 max = current.to();
3648 i++;
3649 }
3650 // Canonical until the i'th range. If that's all of them, we are done.
3651 if (i == n) return;
3652
3653 // The ranges at index i and forward are not canonicalized. Make them so by
3654 // doing the equivalent of insertion sort (inserting each into the previous
3655 // list, in order).
3656 // Notice that inserting a range can reduce the number of ranges in the
3657 // result due to combining of adjacent and overlapping ranges.
3658 intptr_t read = i; // Range to insert.
3659 intptr_t num_canonical = i; // Length of canonicalized part of list.
3660 do {
3661 num_canonical = InsertRangeInCanonicalList(character_ranges,
3662 num_canonical,
3663 character_ranges->At(read));
3664 read++;
3665 } while (read < n);
3666 character_ranges->TruncateTo(num_canonical);
3667
3668 ASSERT(CharacterRange::IsCanonical(character_ranges));
3669 }
3670
3671
3672 void CharacterSet::Canonicalize() {
3673 // Special/default classes are always considered canonical. The result
3674 // of calling ranges() will be sorted.
3675 if (ranges_ == NULL) return;
3676 CharacterRange::Canonicalize(ranges_);
3677 }
3678
3679
3680 ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() {
3681 if (ranges_ == NULL) {
3682 ranges_ = new ZoneGrowableArray<CharacterRange>(2);
3683 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
3684 }
3685 return ranges_;
3686 }
3687
3688
3689 void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges,
3690 ZoneGrowableArray<CharacterRange>* negated_ranges) {
3691 ASSERT(CharacterRange::IsCanonical(ranges));
3692 ASSERT(negated_ranges->length() == 0);
3693 intptr_t range_count = ranges->length();
3694 uint16_t from = 0;
3695 intptr_t i = 0;
3696 if (range_count > 0 && ranges->At(0).from() == 0) {
3697 from = ranges->At(0).to();
3698 i = 1;
3699 }
3700 while (i < range_count) {
3701 CharacterRange range = ranges->At(i);
3702 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
3703 from = range.to();
3704 i++;
3705 }
3706 if (from < Utf16::kMaxCodeUnit) {
3707 negated_ranges->Add(CharacterRange(from + 1, Utf16::kMaxCodeUnit));
3708 }
3709 }
3710
3711
3712 void CharacterRange::AddCaseEquivalents(
3713 ZoneGrowableArray<CharacterRange>* ranges,
3714 bool is_ascii,
3715 Isolate* isolate) {
3716 uint16_t bottom = from();
3717 uint16_t top = to();
3718 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
3719 if (bottom > Symbols::kMaxOneCharCodeSymbol) return;
3720 if (top > Symbols::kMaxOneCharCodeSymbol) {
3721 top = Symbols::kMaxOneCharCodeSymbol;
3722 }
3723 }
3724
3725 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
3726 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;
3727 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3728 if (top == bottom) {
3729 // If this is a singleton we just expand the one character.
3730 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLIN T
3731 for (intptr_t i = 0; i < length; i++) {
3732 uint32_t chr = chars[i];
3733 if (chr != bottom) {
3734 ranges->Add(CharacterRange::Singleton(chars[i]));
3735 }
3736 }
3737 } else {
3738 // If this is a range we expand the characters block by block,
3739 // expanding contiguous subranges (blocks) one at a time.
3740 // The approach is as follows. For a given start character we
3741 // look up the remainder of the block that contains it (represented
3742 // by the end point), for instance we find 'z' if the character
3743 // is 'c'. A block is characterized by the property
3744 // that all characters uncanonicalize in the same way, except that
3745 // each entry in the result is incremented by the distance from the first
3746 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and // NOLINT
3747 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
3748 // Once we've found the end point we look up its uncanonicalization
3749 // and produce a range for each element. For instance for [c-f]
3750 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
3751 // add a range if it is not already contained in the input, so [c-f]
3752 // will be skipped but [C-F] will be added. If this range is not
3753 // completely contained in a block we do this for all the blocks
3754 // covered by the range (handling characters that is not in a block
3755 // as a "singleton block").
3756 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3757 intptr_t pos = bottom;
3758 while (pos <= top) {
3759 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);
3760 uint16_t block_end;
3761 if (length == 0) {
3762 block_end = pos;
3763 } else {
3764 ASSERT(length == 1);
3765 block_end = range[0];
3766 }
3767 intptr_t end = (block_end > top) ? top : block_end;
3768 length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT
3769 for (intptr_t i = 0; i < length; i++) {
3770 uint32_t c = range[i];
3771 uint16_t range_from = c - (block_end - pos);
3772 uint16_t range_to = c - (block_end - end);
3773 if (!(bottom <= range_from && range_to <= top)) {
3774 ranges->Add(CharacterRange(range_from, range_to));
3775 }
3776 }
3777 pos = end + 1;
3778 }
3779 }
3780 }
3781
3782
3783 // -------------------------------------------------------------------
3784 // Splay tree
3785
3786
3787 // Workaround for the fact that ZoneGrowableArray does not have contains().
3788 static bool ArrayContains(ZoneGrowableArray<unsigned>* array,
3789 unsigned value) {
3790 for (intptr_t i = 0; i < array->length(); i++) {
3791 if (array->At(i) == value) {
3792 return true;
3793 }
3794 }
3795 return false;
3796 }
3797
3798
3799 void OutSet::Set(unsigned value, Isolate* isolate) {
3800 if (value < kFirstLimit) {
3801 first_ |= (1 << value);
3802 } else {
3803 if (remaining_ == NULL)
3804 remaining_ = new(isolate) ZoneGrowableArray<unsigned>(1);
3805
3806 bool remaining_contains_value = ArrayContains(remaining_, value);
3807 if (remaining_->is_empty() || !remaining_contains_value) {
3808 remaining_->Add(value);
3809 }
3810 }
3811 }
3812
3813
3814 bool OutSet::Get(unsigned value) const {
3815 if (value < kFirstLimit) {
3816 return (first_ & (1 << value)) != 0;
3817 } else if (remaining_ == NULL) {
3818 return false;
3819 } else {
3820 return ArrayContains(remaining_, value);
3821 }
3822 }
3823
3824 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3825 RegExpCompiler* compiler,
3826 intptr_t filled_in,
3827 bool not_at_start) {
3828 if (assertion_type_ == AT_START && not_at_start) {
3829 details->set_cannot_match();
3830 return;
3831 }
3832 return on_success()->GetQuickCheckDetails(details,
3833 compiler,
3834 filled_in,
3835 not_at_start);
3836 }
3837
3838
3839 void GuardedAlternative::AddGuard(Guard* guard, Isolate* isolate) {
3840 if (guards_ == NULL)
3841 guards_ = new(isolate) ZoneGrowableArray<Guard*>(1);
3842 guards_->Add(guard);
3843 }
3844
3845
3846 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3847 ASSERT(loop_node_ == NULL);
3848 AddAlternative(alt);
3849 loop_node_ = alt.node();
3850 }
3851
3852
3853 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3854 ASSERT(continue_node_ == NULL);
3855 AddAlternative(alt);
3856 continue_node_ = alt.node();
3857 }
3858
3859
3860 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
3861 visitor->VisitLoopChoice(this);
3862 }
3863
3864
3865 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
3866 intptr_t budget,
3867 bool not_at_start) {
3868 return EatsAtLeastHelper(still_to_find,
3869 budget - 1,
3870 loop_node_,
3871 not_at_start);
3872 }
3873
3874
3875 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
3876 RegExpCompiler* compiler,
3877 intptr_t characters_filled_in,
3878 bool not_at_start) {
3879 not_at_start = (not_at_start || not_at_start_);
3880 intptr_t choice_count = alternatives_->length();
3881 ASSERT(choice_count > 0);
3882 (*alternatives_)[0].node()->GetQuickCheckDetails(details,
3883 compiler,
3884 characters_filled_in,
3885 not_at_start);
3886 for (intptr_t i = 1; i < choice_count; i++) {
3887 QuickCheckDetails new_details(details->characters());
3888 RegExpNode* node = (*alternatives_)[i].node();
3889 node->GetQuickCheckDetails(&new_details, compiler,
3890 characters_filled_in,
3891 not_at_start);
3892 // Here we merge the quick match details of the two branches.
3893 details->Merge(&new_details, characters_filled_in);
3894 }
3895 }
3896
3897
3898 void QuickCheckDetails::Clear() {
3899 for (intptr_t i = 0; i < characters_; i++) {
3900 positions_[i].mask = 0;
3901 positions_[i].value = 0;
3902 positions_[i].determines_perfectly = false;
3903 }
3904 characters_ = 0;
3905 }
3906
3907
3908 void QuickCheckDetails::Advance(intptr_t by, bool ascii) {
3909 ASSERT(by >= 0);
3910 if (by >= characters_) {
3911 Clear();
3912 return;
3913 }
3914 for (intptr_t i = 0; i < characters_ - by; i++) {
3915 positions_[i] = positions_[by + i];
3916 }
3917 for (intptr_t i = characters_ - by; i < characters_; i++) {
3918 positions_[i].mask = 0;
3919 positions_[i].value = 0;
3920 positions_[i].determines_perfectly = false;
3921 }
3922 characters_ -= by;
3923 // We could change mask_ and value_ here but we would never advance unless
3924 // they had already been used in a check and they won't be used again because
3925 // it would gain us nothing. So there's no point.
3926 }
3927
3928
3929 void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
3930 ASSERT(characters_ == other->characters_);
3931 if (other->cannot_match_) {
3932 return;
3933 }
3934 if (cannot_match_) {
3935 *this = *other;
3936 return;
3937 }
3938 for (intptr_t i = from_index; i < characters_; i++) {
3939 QuickCheckDetails::Position* pos = positions(i);
3940 QuickCheckDetails::Position* other_pos = other->positions(i);
3941 if (pos->mask != other_pos->mask ||
3942 pos->value != other_pos->value ||
3943 !other_pos->determines_perfectly) {
3944 // Our mask-compare operation will be approximate unless we have the
3945 // exact same operation on both sides of the alternation.
3946 pos->determines_perfectly = false;
3947 }
3948 pos->mask &= other_pos->mask;
3949 pos->value &= pos->mask;
3950 other_pos->value &= pos->mask;
3951 uint16_t differing_bits = (pos->value ^ other_pos->value);
3952 pos->mask &= ~differing_bits;
3953 pos->value &= pos->mask;
3954 }
3955 }
3956
3957
3958 // Finds the fixed match length of a sequence of nodes that goes from
3959 // this alternative and back to this choice node. If there are variable
3960 // length nodes or other complications in the way then return a sentinel
3961 // value indicating that a greedy loop cannot be constructed.
3962 intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(
3963 GuardedAlternative* alternative) {
3964 intptr_t length = 0;
3965 RegExpNode* node = alternative->node();
3966 // Later we will generate code for all these text nodes using recursion
3967 // so we have to limit the max number.
3968 intptr_t recursion_depth = 0;
3969 while (node != this) {
3970 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3971 return kNodeIsTooComplexForGreedyLoops;
3972 }
3973 intptr_t node_length = node->GreedyLoopTextLength();
3974 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3975 return kNodeIsTooComplexForGreedyLoops;
3976 }
3977 length += node_length;
3978 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3979 node = seq_node->on_success();
3980 }
3981 return length;
3982 }
3983
3984
3985 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
3986 QuickCheckDetails* details,
3987 RegExpCompiler* compiler,
3988 intptr_t filled_in,
3989 bool not_at_start) {
3990 // Alternative 0 is the negative lookahead, alternative 1 is what comes
3991 // afterwards.
3992 RegExpNode* node = (*alternatives_)[1].node();
3993 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
3994 }
3995
3996
3997 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
3998 return RegExpEngine::CompilationResult("RegExp too big");
3999 }
4000
4001
4002 // Attempts to compile the regexp using an Irregexp code generator. Returns
4003 // a fixed array or a null handle depending on whether it succeeded.
4004 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case,
4005 intptr_t specialization_cid)
4006 : next_register_(2 * (capture_count + 1)),
4007 work_list_(NULL),
4008 recursion_depth_(0),
4009 ignore_case_(ignore_case),
4010 specialization_cid_(specialization_cid),
4011 reg_exp_too_big_(false),
4012 current_expansion_factor_(1),
4013 isolate_(Isolate::Current()) {
4014 accept_ = new(I) EndNode(EndNode::ACCEPT, I);
4015 }
4016
4017
4018 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
4019 IRRegExpMacroAssembler* macro_assembler,
4020 RegExpNode* start,
4021 intptr_t capture_count,
4022 const String& pattern) {
4023 static const bool use_slow_safe_regexp_compiler = false;
4024
4025 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
4026 macro_assembler_ = macro_assembler;
4027
4028 ZoneGrowableArray<RegExpNode*> work_list(0);
4029 work_list_ = &work_list;
4030 BlockLabel fail;
4031 macro_assembler_->PushBacktrack(&fail);
4032 Trace new_trace;
4033 start->Emit(this, &new_trace);
4034 macro_assembler_->BindBlock(&fail);
4035 macro_assembler_->Fail();
4036 while (!work_list.is_empty()) {
4037 work_list.RemoveLast()->Emit(this, &new_trace);
4038 }
4039 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
4040
4041 macro_assembler->FinalizeIndirectGotos();
4042
4043 return RegExpEngine::CompilationResult(macro_assembler,
4044 macro_assembler->graph_entry(),
4045 macro_assembler->num_blocks(),
4046 macro_assembler->num_stack_locals());
4047 }
4048
4049
4050 RegExpEngine::CompilationResult RegExpEngine::Compile(
4051 RegExpCompileData* data,
4052 const ParsedFunction* parsed_function,
4053 const ZoneGrowableArray<const ICData*>& ic_data_array) {
4054 Isolate* isolate = Isolate::Current();
4055
4056 const Function& function = parsed_function->function();
4057 const intptr_t specialization_cid = function.regexp_cid();
4058 const bool is_ascii = (specialization_cid == kOneByteStringCid ||
4059 specialization_cid == kExternalOneByteStringCid);
4060 JSRegExp& regexp = JSRegExp::Handle(isolate, function.regexp());
4061 const String& pattern = String::Handle(isolate, regexp.pattern());
4062
4063 ASSERT(!regexp.IsNull());
4064 ASSERT(!pattern.IsNull());
4065
4066 const bool ignore_case = regexp.is_ignore_case();
4067 const bool is_global = regexp.is_global();
4068
4069 RegExpCompiler compiler(data->capture_count, ignore_case, specialization_cid);
4070
4071 // TODO(jgruber): Frequency sampling is currently disabled because of several
4072 // issues. We do not want to store subject strings in the regexp object since
4073 // they might be long and we should not prevent their garbage collection.
4074 // Passing them to this function explicitly does not help, since we must
4075 // generate exactly the same IR for both the unoptimizing and optimizing
4076 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
4077 // An option would be to store sampling results in the regexp object, but
4078 // I'm not sure the performance gains are relevant enough.
4079
4080 // Wrap the body of the regexp in capture #0.
4081 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
4082 0,
4083 &compiler,
4084 compiler.accept());
4085
4086 RegExpNode* node = captured_body;
4087 bool is_start_anchored = data->tree->IsAnchoredAtStart();
4088 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
4089 intptr_t max_length = data->tree->max_match();
4090 if (!is_start_anchored) {
4091 // Add a .*? at the beginning, outside the body capture, unless
4092 // this expression is anchored at the beginning.
4093 RegExpNode* loop_node =
4094 RegExpQuantifier::ToNode(0,
4095 RegExpTree::kInfinity,
4096 false,
4097 new(isolate) RegExpCharacterClass('*'),
4098 &compiler,
4099 captured_body,
4100 data->contains_anchor);
4101
4102 if (data->contains_anchor) {
4103 // Unroll loop once, to take care of the case that might start
4104 // at the start of input.
4105 ChoiceNode* first_step_node = new(isolate) ChoiceNode(2, isolate);
4106 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4107 first_step_node->AddAlternative(GuardedAlternative(
4108 new(isolate) TextNode(
4109 new(isolate) RegExpCharacterClass('*'), loop_node)));
4110 node = first_step_node;
4111 } else {
4112 node = loop_node;
4113 }
4114 }
4115 if (is_ascii) {
4116 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
4117 // Do it again to propagate the new nodes to places where they were not
4118 // put because they had not been calculated yet.
4119 if (node != NULL) {
4120 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
4121 }
4122 }
4123
4124 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate);
4125 data->node = node;
4126 Analysis analysis(ignore_case, is_ascii);
4127 analysis.EnsureAnalyzed(node);
4128 if (analysis.has_failed()) {
4129 const char* error_message = analysis.error_message();
4130 return CompilationResult(error_message);
4131 }
4132
4133 // Native regexp implementation.
4134
4135 IRRegExpMacroAssembler* macro_assembler =
4136 new(isolate) IRRegExpMacroAssembler(specialization_cid,
4137 data->capture_count,
4138 parsed_function,
4139 ic_data_array,
4140 isolate);
4141
4142 // Inserted here, instead of in Assembler, because it depends on information
4143 // in the AST that isn't replicated in the Node structure.
4144 static const intptr_t kMaxBacksearchLimit = 1024;
4145 if (is_end_anchored &&
4146 !is_start_anchored &&
4147 max_length < kMaxBacksearchLimit) {
4148 macro_assembler->SetCurrentPositionFromEnd(max_length);
4149 }
4150
4151 if (is_global) {
4152 macro_assembler->set_global_mode(
4153 (data->tree->min_match() > 0)
4154 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
4155 : RegExpMacroAssembler::GLOBAL);
4156 }
4157
4158 RegExpEngine::CompilationResult result =
4159 compiler.Assemble(macro_assembler,
4160 node,
4161 data->capture_count,
4162 pattern);
4163
4164 if (FLAG_trace_irregexp) {
4165 macro_assembler->PrintBlocks();
4166 }
4167
4168 return result;
4169 }
4170
4171
4172 static void CreateSpecializedFunction(Isolate* isolate,
4173 const JSRegExp& regexp,
4174 intptr_t specialization_cid,
4175 const Object& owner) {
4176 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
4177
4178 Function& fn = Function::Handle(isolate,
4179 Function::New(String::Handle(isolate, Symbols::New("RegExp")),
4180 RawFunction::kIrregexpFunction,
4181 true, // Static.
4182 false, // Not const.
4183 false, // Not abstract.
4184 false, // Not external.
4185 false, // Not native.
4186 owner,
4187 0)); // Requires a non-negative token position.
4188
4189 // TODO(jgruber): Share these arrays between all irregexp functions.
4190 fn.set_num_fixed_parameters(kParamCount);
4191 fn.set_parameter_types(Array::Handle(isolate, Array::New(kParamCount,
4192 Heap::kOld)));
4193 fn.set_parameter_names(Array::Handle(isolate, Array::New(kParamCount,
4194 Heap::kOld)));
4195 fn.SetParameterTypeAt(0, Type::Handle(isolate, Type::DynamicType()));
4196 fn.SetParameterNameAt(0, Symbols::string_param_());
4197 fn.SetParameterTypeAt(1, Type::Handle(isolate, Type::DynamicType()));
4198 fn.SetParameterNameAt(1, Symbols::start_index_param_());
4199 fn.set_result_type(Type::Handle(isolate, Type::ArrayType()));
4200
4201 // Cache the result.
4202 regexp.set_function(specialization_cid, fn);
4203
4204 fn.set_regexp(regexp);
4205 fn.set_regexp_cid(specialization_cid);
4206
4207 // The function is compiled lazily during the first call.
4208 }
4209
4210
4211 RawJSRegExp* RegExpEngine::CreateJSRegExp(Isolate* isolate,
4212 const String& pattern,
4213 bool multi_line,
4214 bool ignore_case) {
4215 const JSRegExp& regexp = JSRegExp::Handle(JSRegExp::New(0));
4216
4217 regexp.set_pattern(pattern);
4218
4219 if (multi_line) {
4220 regexp.set_is_multi_line();
4221 }
4222 if (ignore_case) {
4223 regexp.set_is_ignore_case();
4224 }
4225
4226 // TODO(jgruber): We might want to use normal string searching algorithms
4227 // for simple patterns.
4228 regexp.set_is_complex();
4229 regexp.set_is_global(); // All dart regexps are global.
4230
4231 const Library& lib = Library::Handle(isolate, Library::CoreLibrary());
4232 const Class& owner = Class::Handle(isolate,
4233 lib.LookupClass(String::Handle(isolate, Symbols::New("RegExp"))));
4234
4235 CreateSpecializedFunction(isolate, regexp, kOneByteStringCid, owner);
4236 CreateSpecializedFunction(isolate, regexp, kTwoByteStringCid, owner);
4237 CreateSpecializedFunction(isolate, regexp, kExternalOneByteStringCid, owner);
4238 CreateSpecializedFunction(isolate, regexp, kExternalTwoByteStringCid, owner);
4239
4240 return regexp.raw();
4241 }
4242
4243
4244 void BoyerMoorePositionInfo::Set(intptr_t character) {
4245 SetInterval(Interval(character, character));
4246 }
4247
4248
4249 ContainedInLattice AddRange(ContainedInLattice containment,
4250 const intptr_t* ranges,
4251 intptr_t ranges_length,
4252 Interval new_range) {
4253 ASSERT((ranges_length & 1) == 1);
4254 ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1);
4255 if (containment == kLatticeUnknown) return containment;
4256 bool inside = false;
4257 intptr_t last = 0;
4258 for (intptr_t i = 0; i < ranges_length;
4259 inside = !inside, last = ranges[i], i++) {
4260 // Consider the range from last to ranges[i].
4261 // We haven't got to the new range yet.
4262 if (ranges[i] <= new_range.from()) continue;
4263 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
4264 // inclusive, but the values in ranges are not.
4265 if (last <= new_range.from() && new_range.to() < ranges[i]) {
4266 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
4267 }
4268 return kLatticeUnknown;
4269 }
4270 return containment;
4271 }
4272
4273
4274 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
4275 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
4276 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
4277 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
4278 surrogate_ =
4279 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
4280 if (interval.to() - interval.from() >= kMapSize - 1) {
4281 if (map_count_ != kMapSize) {
4282 map_count_ = kMapSize;
4283 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;
4284 }
4285 return;
4286 }
4287 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
4288 intptr_t mod_character = (i & kMask);
4289 if (!map_->At(mod_character)) {
4290 map_count_++;
4291 (*map_)[mod_character] = true;
4292 }
4293 if (map_count_ == kMapSize) return;
4294 }
4295 }
4296
4297
4298 void BoyerMoorePositionInfo::SetAll() {
4299 s_ = w_ = d_ = kLatticeUnknown;
4300 if (map_count_ != kMapSize) {
4301 map_count_ = kMapSize;
4302 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;
4303 }
4304 }
4305
4306
4307 BoyerMooreLookahead::BoyerMooreLookahead(
4308 intptr_t length, RegExpCompiler* compiler, Isolate* isolate)
4309 : length_(length),
4310 compiler_(compiler) {
4311 if (compiler->ascii()) {
4312 max_char_ = Symbols::kMaxOneCharCodeSymbol;
4313 } else {
4314 max_char_ = Utf16::kMaxCodeUnit;
4315 }
4316 bitmaps_ = new(isolate) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);
4317 for (intptr_t i = 0; i < length; i++) {
4318 bitmaps_->Add(new(isolate) BoyerMoorePositionInfo(isolate));
4319 }
4320 }
4321
4322
4323 // Take all the characters that will not prevent a successful match if they
4324 // occur in the subject string in the range between min_lookahead and
4325 // max_lookahead (inclusive) measured from the current position. If the
4326 // character at max_lookahead offset is not one of these characters, then we
4327 // can safely skip forwards by the number of characters in the range.
4328 intptr_t BoyerMooreLookahead::GetSkipTable(
4329 intptr_t min_lookahead,
4330 intptr_t max_lookahead,
4331 const TypedData& boolean_skip_table) {
4332 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
4333
4334 const intptr_t kSkipArrayEntry = 0;
4335 const intptr_t kDontSkipArrayEntry = 1;
4336
4337 for (intptr_t i = 0; i < kSize; i++) {
4338 boolean_skip_table.SetUint8(i, kSkipArrayEntry);
4339 }
4340 intptr_t skip = max_lookahead + 1 - min_lookahead;
4341
4342 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
4343 BoyerMoorePositionInfo* map = bitmaps_->At(i);
4344 for (intptr_t j = 0; j < kSize; j++) {
4345 if (map->at(j)) {
4346 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);
4347 }
4348 }
4349 }
4350
4351 return skip;
4352 }
4353
4354
4355 // Find the longest range of lookahead that has the fewest number of different
4356 // characters that can occur at a given position. Since we are optimizing two
4357 // different parameters at once this is a tradeoff.
4358 bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
4359 intptr_t biggest_points = 0;
4360 // If more than 32 characters out of 128 can occur it is unlikely that we can
4361 // be lucky enough to step forwards much of the time.
4362 const intptr_t kMaxMax = 32;
4363 for (intptr_t max_number_of_chars = 4;
4364 max_number_of_chars < kMaxMax;
4365 max_number_of_chars *= 2) {
4366 biggest_points =
4367 FindBestInterval(max_number_of_chars, biggest_points, from, to);
4368 }
4369 if (biggest_points == 0) return false;
4370 return true;
4371 }
4372
4373
4374 // Find the highest-points range between 0 and length_ where the character
4375 // information is not too vague. 'Too vague' means that there are more than
4376 // max_number_of_chars that can occur at this position. Calculates the number
4377 // of points as the product of width-of-the-range and
4378 // probability-of-finding-one-of-the-characters, where the probability is
4379 // calculated using the frequency distribution of the sample subject string.
4380 intptr_t BoyerMooreLookahead::FindBestInterval(
4381 intptr_t max_number_of_chars,
4382 intptr_t old_biggest_points,
4383 intptr_t* from,
4384 intptr_t* to) {
4385 intptr_t biggest_points = old_biggest_points;
4386 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
4387 for (intptr_t i = 0; i < length_; ) {
4388 while (i < length_ && Count(i) > max_number_of_chars) i++;
4389 if (i == length_) break;
4390 intptr_t remembered_from = i;
4391 bool union_map[kSize];
4392 for (intptr_t j = 0; j < kSize; j++) union_map[j] = false;
4393 while (i < length_ && Count(i) <= max_number_of_chars) {
4394 BoyerMoorePositionInfo* map = bitmaps_->At(i);
4395 for (intptr_t j = 0; j < kSize; j++) union_map[j] |= map->at(j);
4396 i++;
4397 }
4398 intptr_t frequency = 0;
4399 for (intptr_t j = 0; j < kSize; j++) {
4400 if (union_map[j]) {
4401 // Add 1 to the frequency to give a small per-character boost for
4402 // the cases where our sampling is not good enough and many
4403 // characters have a frequency of zero. This means the frequency
4404 // can theoretically be up to 2*kSize though we treat it mostly as
4405 // a fraction of kSize.
4406 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
4407 }
4408 }
4409 // We use the probability of skipping times the distance we are skipping to
4410 // judge the effectiveness of this. Actually we have a cut-off: By
4411 // dividing by 2 we switch off the skipping if the probability of skipping
4412 // is less than 50%. This is because the multibyte mask-and-compare
4413 // skipping in quickcheck is more likely to do well on this case.
4414 bool in_quickcheck_range = ((i - remembered_from < 4) ||
4415 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
4416 // Called 'probability' but it is only a rough estimate and can actually
4417 // be outside the 0-kSize range.
4418 intptr_t probability =
4419 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
4420 intptr_t points = (i - remembered_from) * probability;
4421 if (points > biggest_points) {
4422 *from = remembered_from;
4423 *to = i - 1;
4424 biggest_points = points;
4425 }
4426 }
4427 return biggest_points;
4428 }
4429
4430
4431 // See comment above on the implementation of GetSkipTable.
4432 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
4433 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
4434
4435 intptr_t min_lookahead = 0;
4436 intptr_t max_lookahead = 0;
4437
4438 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
4439
4440 bool found_single_character = false;
4441 intptr_t single_character = 0;
4442 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
4443 BoyerMoorePositionInfo* map = bitmaps_->At(i);
4444 if (map->map_count() > 1 ||
4445 (found_single_character && map->map_count() != 0)) {
4446 found_single_character = false;
4447 break;
4448 }
4449 for (intptr_t j = 0; j < kSize; j++) {
4450 if (map->at(j)) {
4451 found_single_character = true;
4452 single_character = j;
4453 break;
4454 }
4455 }
4456 }
4457
4458 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
4459
4460 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
4461 // The mask-compare can probably handle this better.
4462 return false;
4463 }
4464
4465 if (found_single_character) {
4466 BlockLabel cont, again;
4467 masm->BindBlock(&again);
4468 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
4469 if (max_char_ > kSize) {
4470 masm->CheckCharacterAfterAnd(single_character,
4471 RegExpMacroAssembler::kTableMask,
4472 &cont);
4473 } else {
4474 masm->CheckCharacter(single_character, &cont);
4475 }
4476 masm->AdvanceCurrentPosition(lookahead_width);
4477 masm->GoTo(&again);
4478 masm->BindBlock(&cont);
4479 return true;
4480 }
4481
4482 const TypedData& boolean_skip_table = TypedData::ZoneHandle(
4483 compiler_->isolate(),
4484 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
4485 intptr_t skip_distance = GetSkipTable(
4486 min_lookahead, max_lookahead, boolean_skip_table);
4487 ASSERT(skip_distance != 0);
4488
4489 BlockLabel cont, again;
4490
4491 masm->BindBlock(&again);
4492 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
4493 masm->CheckBitInTable(boolean_skip_table, &cont);
4494 masm->AdvanceCurrentPosition(skip_distance);
4495 masm->GoTo(&again);
4496 masm->BindBlock(&cont);
4497
4498 return true;
4499 }
4500
4501
4502 // -------------------------------------------------------------------
4503 // Analysis
4504
4505
4506 void Analysis::EnsureAnalyzed(RegExpNode* that) {
4507 if (that->info()->been_analyzed || that->info()->being_analyzed)
4508 return;
4509 that->info()->being_analyzed = true;
4510 that->Accept(this);
4511 that->info()->being_analyzed = false;
4512 that->info()->been_analyzed = true;
4513 }
4514
4515
4516 void Analysis::VisitEnd(EndNode* that) {
4517 // nothing to do
4518 }
4519
4520
4521 void TextNode::CalculateOffsets() {
4522 intptr_t element_count = elements()->length();
4523 // Set up the offsets of the elements relative to the start. This is a fixed
4524 // quantity since a TextNode can only contain fixed-width things.
4525 intptr_t cp_offset = 0;
4526 for (intptr_t i = 0; i < element_count; i++) {
4527 TextElement& elm = (*elements())[i];
4528 elm.set_cp_offset(cp_offset);
4529 cp_offset += elm.length();
4530 }
4531 }
4532
4533
4534 void Analysis::VisitText(TextNode* that) {
4535 if (ignore_case_) {
4536 that->MakeCaseIndependent(is_ascii_);
4537 }
4538 EnsureAnalyzed(that->on_success());
4539 if (!has_failed()) {
4540 that->CalculateOffsets();
4541 }
4542 }
4543
4544
4545 void Analysis::VisitAction(ActionNode* that) {
4546 RegExpNode* target = that->on_success();
4547 EnsureAnalyzed(target);
4548 if (!has_failed()) {
4549 // If the next node is interested in what it follows then this node
4550 // has to be interested too so it can pass the information on.
4551 that->info()->AddFromFollowing(target->info());
4552 }
4553 }
4554
4555
4556 void Analysis::VisitChoice(ChoiceNode* that) {
4557 NodeInfo* info = that->info();
4558 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
4559 RegExpNode* node = (*that->alternatives())[i].node();
4560 EnsureAnalyzed(node);
4561 if (has_failed()) return;
4562 // Anything the following nodes need to know has to be known by
4563 // this node also, so it can pass it on.
4564 info->AddFromFollowing(node->info());
4565 }
4566 }
4567
4568
4569 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4570 NodeInfo* info = that->info();
4571 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
4572 RegExpNode* node = (*that->alternatives())[i].node();
4573 if (node != that->loop_node()) {
4574 EnsureAnalyzed(node);
4575 if (has_failed()) return;
4576 info->AddFromFollowing(node->info());
4577 }
4578 }
4579 // Check the loop last since it may need the value of this node
4580 // to get a correct result.
4581 EnsureAnalyzed(that->loop_node());
4582 if (!has_failed()) {
4583 info->AddFromFollowing(that->loop_node()->info());
4584 }
4585 }
4586
4587
4588 void Analysis::VisitBackReference(BackReferenceNode* that) {
4589 EnsureAnalyzed(that->on_success());
4590 }
4591
4592
4593 void Analysis::VisitAssertion(AssertionNode* that) {
4594 EnsureAnalyzed(that->on_success());
4595 }
4596
4597
4598 // -------------------------------------------------------------------
4599 // Dot/dotty output
4600
4601
4602 #ifdef DEBUG
4603
4604
4605 class DotPrinter: public NodeVisitor {
4606 public:
4607 explicit DotPrinter(bool ignore_case)
4608 : ignore_case_(ignore_case) {}
4609 void PrintNode(const char* label, RegExpNode* node);
4610 void Visit(RegExpNode* node);
4611 void PrintAttributes(RegExpNode* from);
4612 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4613 #define DECLARE_VISIT(Type) \
4614 virtual void Visit##Type(Type##Node* that);
4615 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4616 #undef DECLARE_VISIT
4617 private:
4618 bool ignore_case_;
4619 };
4620
4621
4622 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4623 OS::Print("digraph G {\n graph [label=\"");
4624 for (intptr_t i = 0; label[i]; i++) {
4625 switch (label[i]) {
4626 case '\\':
4627 OS::Print("\\\\");
4628 break;
4629 case '"':
4630 OS::Print("\"");
4631 break;
4632 default:
4633 OS::Print("%c", label[i]);
4634 break;
4635 }
4636 }
4637 OS::Print("\"];\n");
4638 Visit(node);
4639 OS::Print("}\n");
4640 }
4641
4642
4643 void DotPrinter::Visit(RegExpNode* node) {
4644 if (node->info()->visited) return;
4645 node->info()->visited = true;
4646 node->Accept(this);
4647 }
4648
4649
4650 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4651 OS::Print(" n%p -> n%p [style=dotted];\n", from, on_failure);
4652 Visit(on_failure);
4653 }
4654
4655
4656 class AttributePrinter : public ValueObject {
4657 public:
4658 AttributePrinter() : first_(true) {}
4659 void PrintSeparator() {
4660 if (first_) {
4661 first_ = false;
4662 } else {
4663 OS::Print("|");
4664 }
4665 }
4666 void PrintBit(const char* name, bool value) {
4667 if (!value) return;
4668 PrintSeparator();
4669 OS::Print("{%s}", name);
4670 }
4671 void PrintPositive(const char* name, intptr_t value) {
4672 if (value < 0) return;
4673 PrintSeparator();
4674 OS::Print("{%s|%" Pd "}", name, value);
4675 }
4676
4677 private:
4678 bool first_;
4679 };
4680
4681
4682 void DotPrinter::PrintAttributes(RegExpNode* that) {
4683 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
4684 "margin=0.1, fontsize=10, label=\"{", that);
4685 AttributePrinter printer;
4686 NodeInfo* info = that->info();
4687 printer.PrintBit("NI", info->follows_newline_interest);
4688 printer.PrintBit("WI", info->follows_word_interest);
4689 printer.PrintBit("SI", info->follows_start_interest);
4690 BlockLabel* label = that->label();
4691 if (label->IsBound())
4692 printer.PrintPositive("@", label->Position());
4693 OS::Print("}\"];\n"
4694 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
4695 that, that);
4696 }
4697
4698
4699 static const bool kPrintDispatchTable = false;
4700 void DotPrinter::VisitChoice(ChoiceNode* that) {
4701 if (kPrintDispatchTable) {
4702 OS::Print(" n%p [shape=Mrecord, label=\"", that);
4703 UNIMPLEMENTED();
4704 } else {
4705 OS::Print(" n%p [shape=Mrecord, label=\"?\"];\n", that);
4706 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
4707 GuardedAlternative alt = that->alternatives()->At(i);
4708 OS::Print(" n%p -> n%p", that, alt.node());
4709 }
4710 }
4711 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
4712 GuardedAlternative alt = that->alternatives()->At(i);
4713 alt.node()->Accept(this);
4714 }
4715 }
4716
4717
4718 void DotPrinter::VisitText(TextNode* that) {
4719 OS::Print(" n%p [label=\"", that);
4720 for (intptr_t i = 0; i < that->elements()->length(); i++) {
4721 if (i > 0) OS::Print(" ");
4722 TextElement elm = that->elements()->At(i);
4723 switch (elm.text_type()) {
4724 case TextElement::ATOM: {
4725 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
4726 for (intptr_t i = 0; i < data->length(); i++) {
4727 OS::Print("%c", static_cast<char>(data->At(i)));
4728 }
4729 break;
4730 }
4731 case TextElement::CHAR_CLASS: {
4732 RegExpCharacterClass* node = elm.char_class();
4733 OS::Print("[");
4734 if (node->is_negated()) OS::Print("^");
4735 for (intptr_t j = 0; j < node->ranges()->length(); j++) {
4736 CharacterRange range = node->ranges()->At(j);
4737 PrintUtf16(range.from());
4738 OS::Print("-");
4739 PrintUtf16(range.to());
4740 }
4741 OS::Print("]");
4742 break;
4743 }
4744 default:
4745 UNREACHABLE();
4746 }
4747 }
4748 OS::Print("\", shape=box, peripheries=2];\n");
4749 PrintAttributes(that);
4750 OS::Print(" n%p -> n%p;\n", that, that->on_success());
4751 Visit(that->on_success());
4752 }
4753
4754
4755 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4756 OS::Print(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
4757 that, that->start_register(), that->end_register());
4758 PrintAttributes(that);
4759 OS::Print(" n%p -> n%p;\n", that, that->on_success());
4760 Visit(that->on_success());
4761 }
4762
4763
4764 void DotPrinter::VisitEnd(EndNode* that) {
4765 OS::Print(" n%p [style=bold, shape=point];\n", that);
4766 PrintAttributes(that);
4767 }
4768
4769
4770 void DotPrinter::VisitAssertion(AssertionNode* that) {
4771 OS::Print(" n%p [", that);
4772 switch (that->assertion_type()) {
4773 case AssertionNode::AT_END:
4774 OS::Print("label=\"$\", shape=septagon");
4775 break;
4776 case AssertionNode::AT_START:
4777 OS::Print("label=\"^\", shape=septagon");
4778 break;
4779 case AssertionNode::AT_BOUNDARY:
4780 OS::Print("label=\"\\b\", shape=septagon");
4781 break;
4782 case AssertionNode::AT_NON_BOUNDARY:
4783 OS::Print("label=\"\\B\", shape=septagon");
4784 break;
4785 case AssertionNode::AFTER_NEWLINE:
4786 OS::Print("label=\"(?<=\\n)\", shape=septagon");
4787 break;
4788 }
4789 OS::Print("];\n");
4790 PrintAttributes(that);
4791 RegExpNode* successor = that->on_success();
4792 OS::Print(" n%p -> n%p;\n", that, successor);
4793 Visit(successor);
4794 }
4795
4796
4797 void DotPrinter::VisitAction(ActionNode* that) {
4798 OS::Print(" n%p [", that);
4799 switch (that->action_type_) {
4800 case ActionNode::SET_REGISTER:
4801 OS::Print("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
4802 that->data_.u_store_register.reg,
4803 that->data_.u_store_register.value);
4804 break;
4805 case ActionNode::INCREMENT_REGISTER:
4806 OS::Print("label=\"$%" Pd "++\", shape=octagon",
4807 that->data_.u_increment_register.reg);
4808 break;
4809 case ActionNode::STORE_POSITION:
4810 OS::Print("label=\"$%" Pd ":=$pos\", shape=octagon",
4811 that->data_.u_position_register.reg);
4812 break;
4813 case ActionNode::BEGIN_SUBMATCH:
4814 OS::Print("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
4815 that->data_.u_submatch.current_position_register);
4816 break;
4817 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4818 OS::Print("label=\"escape\", shape=septagon");
4819 break;
4820 case ActionNode::EMPTY_MATCH_CHECK:
4821 OS::Print("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
4822 that->data_.u_empty_match_check.start_register,
4823 that->data_.u_empty_match_check.repetition_register,
4824 that->data_.u_empty_match_check.repetition_limit);
4825 break;
4826 case ActionNode::CLEAR_CAPTURES: {
4827 OS::Print("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
4828 that->data_.u_clear_captures.range_from,
4829 that->data_.u_clear_captures.range_to);
4830 break;
4831 }
4832 }
4833 OS::Print("];\n");
4834 PrintAttributes(that);
4835 RegExpNode* successor = that->on_success();
4836 OS::Print(" n%p -> n%p;\n", that, successor);
4837 Visit(successor);
4838 }
4839
4840
4841 void RegExpEngine::DotPrint(const char* label,
4842 RegExpNode* node,
4843 bool ignore_case) {
4844 DotPrinter printer(ignore_case);
4845 printer.PrintNode(label, node);
4846 }
4847
4848
4849 #endif // DEBUG
4850
4851 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/regexp.h ('k') | runtime/vm/regexp_assembler.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698