Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(149)

Side by Side Diff: runtime/vm/regexp.cc

Issue 683433003: Integrate the Irregexp Regular Expression Engine. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: rebase, enable tests, remove *CodeUnitsAt Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/regexp.h"
6
7 #include "vm/dart_entry.h"
8 #include "vm/regexp_assembler.h"
9 #include "vm/regexp_ast.h"
10 #include "vm/unibrow-inl.h"
11 #include "vm/unicode.h"
12 #include "vm/symbols.h"
13
14 #define I (isolate())
15 #define CI (compiler->isolate())
16
17 namespace dart {
18
19 DECLARE_FLAG(bool, trace_irregexp);
20
21 // Default to generating optimized regexp code.
22 static const bool kRegexpOptimization = true;
23
24 // More makes code generation slower, less makes V8 benchmark score lower.
25 static const intptr_t kMaxLookaheadForBoyerMoore = 8;
26
27 ContainedInLattice AddRange(ContainedInLattice containment,
28 const intptr_t* ranges,
29 intptr_t ranges_length,
30 Interval new_range) {
31 ASSERT((ranges_length & 1) == 1);
32 ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1);
33 if (containment == kLatticeUnknown) return containment;
34 bool inside = false;
35 intptr_t last = 0;
36 for (intptr_t i = 0; i < ranges_length;
37 inside = !inside, last = ranges[i], i++) {
38 // Consider the range from last to ranges[i].
39 // We haven't got to the new range yet.
40 if (ranges[i] <= new_range.from()) continue;
41 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
42 // inclusive, but the values in ranges are not.
43 if (last <= new_range.from() && new_range.to() < ranges[i]) {
44 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
45 }
46 return kLatticeUnknown;
47 }
48 return containment;
49 }
50
51 // -------------------------------------------------------------------
52 // Implementation of the Irregexp regular expression engine.
53 //
54 // The Irregexp regular expression engine is intended to be a complete
55 // implementation of ECMAScript regular expressions. It generates
56 // IR code that is subsequently compiled to native code.
57
58 // The Irregexp regexp engine is structured in three steps.
59 // 1) The parser generates an abstract syntax tree. See regexp_ast.cc.
60 // 2) From the AST a node network is created. The nodes are all
61 // subclasses of RegExpNode. The nodes represent states when
62 // executing a regular expression. Several optimizations are
63 // performed on the node network.
64 // 3) From the nodes we generate IR instructions that can actually
65 // execute the regular expression (perform the search). The
66 // code generation step is described in more detail below.
67
68 // Code generation.
69 //
70 // The nodes are divided into four main categories.
71 // * Choice nodes
72 // These represent places where the regular expression can
73 // match in more than one way. For example on entry to an
74 // alternation (foo|bar) or a repetition (*, +, ? or {}).
75 // * Action nodes
76 // These represent places where some action should be
77 // performed. Examples include recording the current position
78 // in the input string to a register (in order to implement
79 // captures) or other actions on register for example in order
80 // to implement the counters needed for {} repetitions.
81 // * Matching nodes
82 // These attempt to match some element part of the input string.
83 // Examples of elements include character classes, plain strings
84 // or back references.
85 // * End nodes
86 // These are used to implement the actions required on finding
87 // a successful match or failing to find a match.
88 //
89 // The code generated maintains some state as it runs. This consists of the
90 // following elements:
91 //
92 // * The capture registers. Used for string captures.
93 // * Other registers. Used for counters etc.
94 // * The current position.
95 // * The stack of backtracking information. Used when a matching node
96 // fails to find a match and needs to try an alternative.
97 //
98 // Conceptual regular expression execution model:
99 //
100 // There is a simple conceptual model of regular expression execution
101 // which will be presented first. The actual code generated is a more
102 // efficient simulation of the simple conceptual model:
103 //
104 // * Choice nodes are implemented as follows:
105 // For each choice except the last {
106 // push current position
107 // push backtrack code location
108 // <generate code to test for choice>
109 // backtrack code location:
110 // pop current position
111 // }
112 // <generate code to test for last choice>
113 //
114 // * Actions nodes are generated as follows
115 // <push affected registers on backtrack stack>
116 // <generate code to perform action>
117 // push backtrack code location
118 // <generate code to test for following nodes>
119 // backtrack code location:
120 // <pop affected registers to restore their state>
121 // <pop backtrack location from stack and go to it>
122 //
123 // * Matching nodes are generated as follows:
124 // if input string matches at current position
125 // update current position
126 // <generate code to test for following nodes>
127 // else
128 // <pop backtrack location from stack and go to it>
129 //
130 // Thus it can be seen that the current position is saved and restored
131 // by the choice nodes, whereas the registers are saved and restored by
132 // by the action nodes that manipulate them.
133 //
134 // The other interesting aspect of this model is that nodes are generated
135 // at the point where they are needed by a recursive call to Emit(). If
136 // the node has already been code generated then the Emit() call will
137 // generate a jump to the previously generated code instead. In order to
138 // limit recursion it is possible for the Emit() function to put the node
139 // on a work list for later generation and instead generate a jump. The
140 // destination of the jump is resolved later when the code is generated.
141 //
142 // Actual regular expression code generation.
143 //
144 // Code generation is actually more complicated than the above. In order
145 // to improve the efficiency of the generated code some optimizations are
146 // performed
147 //
148 // * Choice nodes have 1-character lookahead.
149 // A choice node looks at the following character and eliminates some of
150 // the choices immediately based on that character. This is not yet
151 // implemented.
152 // * Simple greedy loops store reduced backtracking information.
153 // A quantifier like /.*foo/m will greedily match the whole input. It will
154 // then need to backtrack to a point where it can match "foo". The naive
155 // implementation of this would push each character position onto the
156 // backtracking stack, then pop them off one by one. This would use space
157 // proportional to the length of the input string. However since the "."
158 // can only match in one way and always has a constant length (in this case
159 // of 1) it suffices to store the current position on the top of the stack
160 // once. Matching now becomes merely incrementing the current position and
161 // backtracking becomes decrementing the current position and checking the
162 // result against the stored current position. This is faster and saves
163 // space.
164 // * The current state is virtualized.
165 // This is used to defer expensive operations until it is clear that they
166 // are needed and to generate code for a node more than once, allowing
167 // specialized an efficient versions of the code to be created. This is
168 // explained in the section below.
169 //
170 // Execution state virtualization.
171 //
172 // Instead of emitting code, nodes that manipulate the state can record their
173 // manipulation in an object called the Trace. The Trace object can record a
174 // current position offset, an optional backtrack code location on the top of
175 // the virtualized backtrack stack and some register changes. When a node is
176 // to be emitted it can flush the Trace or update it. Flushing the Trace
177 // will emit code to bring the actual state into line with the virtual state.
178 // Avoiding flushing the state can postpone some work (e.g. updates of capture
179 // registers). Postponing work can save time when executing the regular
180 // expression since it may be found that the work never has to be done as a
181 // failure to match can occur. In addition it is much faster to jump to a
182 // known backtrack code location than it is to pop an unknown backtrack
183 // location from the stack and jump there.
184 //
185 // The virtual state found in the Trace affects code generation. For example
186 // the virtual state contains the difference between the actual current
187 // position and the virtual current position, and matching code needs to use
188 // this offset to attempt a match in the correct location of the input
189 // string. Therefore code generated for a non-trivial trace is specialized
190 // to that trace. The code generator therefore has the ability to generate
191 // code for each node several times. In order to limit the size of the
192 // generated code there is an arbitrary limit on how many specialized sets of
193 // code may be generated for a given node. If the limit is reached, the
194 // trace is flushed and a generic version of the code for a node is emitted.
195 // This is subsequently used for that node. The code emitted for non-generic
196 // trace is not recorded in the node and so it cannot currently be reused in
197 // the event that code generation is requested for an identical trace.
198
199
200 void RegExpTree::AppendToText(RegExpText* text) {
201 UNREACHABLE();
202 }
203
204
205 void RegExpAtom::AppendToText(RegExpText* text) {
206 text->AddElement(TextElement::Atom(this));
207 }
208
209
210 void RegExpCharacterClass::AppendToText(RegExpText* text) {
211 text->AddElement(TextElement::CharClass(this));
212 }
213
214
215 void RegExpText::AppendToText(RegExpText* text) {
216 for (intptr_t i = 0; i < elements()->length(); i++)
217 text->AddElement((*elements())[i]);
218 }
219
220
221 TextElement TextElement::Atom(RegExpAtom* atom) {
222 return TextElement(ATOM, atom);
223 }
224
225
226 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
227 return TextElement(CHAR_CLASS, char_class);
228 }
229
230
231 intptr_t TextElement::length() const {
232 switch (text_type()) {
233 case ATOM:
234 return atom()->length();
235
236 case CHAR_CLASS:
237 return 1;
238 }
239 UNREACHABLE();
240 return 0;
241 }
242
243
244 class FrequencyCollator : public ValueObject {
245 public:
246 FrequencyCollator() : total_samples_(0) {
247 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
248 frequencies_[i] = CharacterFrequency(i);
249 }
250 }
251
252 void CountCharacter(intptr_t character) {
253 intptr_t index = (character & RegExpMacroAssembler::kTableMask);
254 frequencies_[index].Increment();
255 total_samples_++;
256 }
257
258 // Does not measure in percent, but rather per-128 (the table size from the
259 // regexp macro assembler).
260 intptr_t Frequency(intptr_t in_character) {
261 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
262 if (total_samples_ < 1) return 1; // Division by zero.
263 intptr_t freq_in_per128 =
264 (frequencies_[in_character].counter() * 128) / total_samples_;
265 return freq_in_per128;
266 }
267
268 private:
269 class CharacterFrequency {
270 public:
271 CharacterFrequency() : counter_(0), character_(-1) { }
272 explicit CharacterFrequency(intptr_t character)
273 : counter_(0), character_(character) { }
274
275 void Increment() { counter_++; }
276 intptr_t counter() { return counter_; }
277 intptr_t character() { return character_; }
278
279 private:
280 intptr_t counter_;
281 intptr_t character_;
282
283 DISALLOW_ALLOCATION();
284 };
285
286
287 private:
288 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
289 intptr_t total_samples_;
290 };
291
292
293 class RegExpCompiler : public ValueObject {
294 public:
295 RegExpCompiler(intptr_t capture_count,
296 bool ignore_case,
297 intptr_t specialization_cid);
298
299 intptr_t AllocateRegister() {
300 return next_register_++;
301 }
302
303 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,
304 RegExpNode* start,
305 intptr_t capture_count,
306 const String& pattern);
307
308 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
309
310 static const intptr_t kImplementationOffset = 0;
311 static const intptr_t kNumberOfRegistersOffset = 0;
312 static const intptr_t kCodeOffset = 1;
313
314 IRRegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
315 EndNode* accept() { return accept_; }
316
317 static const intptr_t kMaxRecursion = 100;
318 inline intptr_t recursion_depth() { return recursion_depth_; }
319 inline void IncrementRecursionDepth() { recursion_depth_++; }
320 inline void DecrementRecursionDepth() { recursion_depth_--; }
321
322 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
323
324 inline bool ignore_case() { return ignore_case_; }
325 inline bool one_byte() const {
326 return (specialization_cid_ == kOneByteStringCid ||
327 specialization_cid_ == kExternalOneByteStringCid);
328 }
329 inline intptr_t specialization_cid() { return specialization_cid_; }
330 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
331
332 intptr_t current_expansion_factor() { return current_expansion_factor_; }
333 void set_current_expansion_factor(intptr_t value) {
334 current_expansion_factor_ = value;
335 }
336
337 Isolate* isolate() const { return isolate_; }
338
339 static const intptr_t kNoRegister = -1;
340
341 private:
342 EndNode* accept_;
343 intptr_t next_register_;
344 ZoneGrowableArray<RegExpNode*>* work_list_;
345 intptr_t recursion_depth_;
346 IRRegExpMacroAssembler* macro_assembler_;
347 bool ignore_case_;
348 intptr_t specialization_cid_;
349 bool reg_exp_too_big_;
350 intptr_t current_expansion_factor_;
351 FrequencyCollator frequency_collator_;
352 Isolate* isolate_;
353 };
354
355
356 class RecursionCheck : public ValueObject {
357 public:
358 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
359 compiler->IncrementRecursionDepth();
360 }
361 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
362 private:
363 RegExpCompiler* compiler_;
364 };
365
366
367 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
368 return RegExpEngine::CompilationResult("RegExp too big");
369 }
370
371
372 // Attempts to compile the regexp using an Irregexp code generator. Returns
373 // a fixed array or a null handle depending on whether it succeeded.
374 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case,
375 intptr_t specialization_cid)
376 : next_register_(2 * (capture_count + 1)),
377 work_list_(NULL),
378 recursion_depth_(0),
379 ignore_case_(ignore_case),
380 specialization_cid_(specialization_cid),
381 reg_exp_too_big_(false),
382 current_expansion_factor_(1),
383 isolate_(Isolate::Current()) {
384 accept_ = new(I) EndNode(EndNode::ACCEPT, I);
385 }
386
387
388 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
389 IRRegExpMacroAssembler* macro_assembler,
390 RegExpNode* start,
391 intptr_t capture_count,
392 const String& pattern) {
393 static const bool use_slow_safe_regexp_compiler = false;
394
395 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
396 macro_assembler_ = macro_assembler;
397
398 ZoneGrowableArray<RegExpNode*> work_list(0);
399 work_list_ = &work_list;
400 BlockLabel fail;
401 macro_assembler_->PushBacktrack(&fail);
402 Trace new_trace;
403 start->Emit(this, &new_trace);
404 macro_assembler_->BindBlock(&fail);
405 macro_assembler_->Fail();
406 while (!work_list.is_empty()) {
407 work_list.RemoveLast()->Emit(this, &new_trace);
408 }
409 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
410
411 macro_assembler->FinalizeIndirectGotos();
412
413 return RegExpEngine::CompilationResult(macro_assembler,
414 macro_assembler->graph_entry(),
415 macro_assembler->num_blocks(),
416 macro_assembler->num_stack_locals());
417 }
418
419
420 bool Trace::DeferredAction::Mentions(intptr_t that) {
421 if (action_type() == ActionNode::CLEAR_CAPTURES) {
422 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
423 return range.Contains(that);
424 } else {
425 return reg() == that;
426 }
427 }
428
429
430 bool Trace::mentions_reg(intptr_t reg) {
431 for (DeferredAction* action = actions_;
432 action != NULL;
433 action = action->next()) {
434 if (action->Mentions(reg))
435 return true;
436 }
437 return false;
438 }
439
440
441 bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
442 ASSERT(*cp_offset == 0);
443 for (DeferredAction* action = actions_;
444 action != NULL;
445 action = action->next()) {
446 if (action->Mentions(reg)) {
447 if (action->action_type() == ActionNode::STORE_POSITION) {
448 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
449 return true;
450 } else {
451 return false;
452 }
453 }
454 }
455 return false;
456 }
457
458
459 // This is called as we come into a loop choice node and some other tricky
460 // nodes. It normalizes the state of the code generator to ensure we can
461 // generate generic code.
462 intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers,
463 Isolate* isolate) {
464 intptr_t max_register = RegExpCompiler::kNoRegister;
465 for (DeferredAction* action = actions_;
466 action != NULL;
467 action = action->next()) {
468 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
469 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
470 for (intptr_t i = range.from(); i <= range.to(); i++)
471 affected_registers->Set(i, isolate);
472 if (range.to() > max_register) max_register = range.to();
473 } else {
474 affected_registers->Set(action->reg(), isolate);
475 if (action->reg() > max_register) max_register = action->reg();
476 }
477 }
478 return max_register;
479 }
480
481
482 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
483 intptr_t max_register,
484 const OutSet& registers_to_pop,
485 const OutSet& registers_to_clear) {
486 for (intptr_t reg = max_register; reg >= 0; reg--) {
487 if (registers_to_pop.Get(reg)) {
488 assembler->PopRegister(reg);
489 } else if (registers_to_clear.Get(reg)) {
490 intptr_t clear_to = reg;
491 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
492 reg--;
493 }
494 assembler->ClearRegisters(reg, clear_to);
495 }
496 }
497 }
498
499
500 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
501 intptr_t max_register,
502 const OutSet& affected_registers,
503 OutSet* registers_to_pop,
504 OutSet* registers_to_clear,
505 Isolate* isolate) {
506 for (intptr_t reg = 0; reg <= max_register; reg++) {
507 if (!affected_registers.Get(reg)) {
508 continue;
509 }
510
511 // The chronologically first deferred action in the trace
512 // is used to infer the action needed to restore a register
513 // to its previous state (or not, if it's safe to ignore it).
514 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
515 DeferredActionUndoType undo_action = IGNORE;
516
517 intptr_t value = 0;
518 bool absolute = false;
519 bool clear = false;
520 intptr_t store_position = -1;
521 // This is a little tricky because we are scanning the actions in reverse
522 // historical order (newest first).
523 for (DeferredAction* action = actions_;
524 action != NULL;
525 action = action->next()) {
526 if (action->Mentions(reg)) {
527 switch (action->action_type()) {
528 case ActionNode::SET_REGISTER: {
529 Trace::DeferredSetRegister* psr =
530 static_cast<Trace::DeferredSetRegister*>(action);
531 if (!absolute) {
532 value += psr->value();
533 absolute = true;
534 }
535 // SET_REGISTER is currently only used for newly introduced loop
536 // counters. They can have a significant previous value if they
537 // occour in a loop. TODO(lrn): Propagate this information, so
538 // we can set undo_action to IGNORE if we know there is no value to
539 // restore.
540 undo_action = RESTORE;
541 ASSERT(store_position == -1);
542 ASSERT(!clear);
543 break;
544 }
545 case ActionNode::INCREMENT_REGISTER:
546 if (!absolute) {
547 value++;
548 }
549 ASSERT(store_position == -1);
550 ASSERT(!clear);
551 undo_action = RESTORE;
552 break;
553 case ActionNode::STORE_POSITION: {
554 Trace::DeferredCapture* pc =
555 static_cast<Trace::DeferredCapture*>(action);
556 if (!clear && store_position == -1) {
557 store_position = pc->cp_offset();
558 }
559
560 // For captures we know that stores and clears alternate.
561 // Other register, are never cleared, and if the occur
562 // inside a loop, they might be assigned more than once.
563 if (reg <= 1) {
564 // Registers zero and one, aka "capture zero", is
565 // always set correctly if we succeed. There is no
566 // need to undo a setting on backtrack, because we
567 // will set it again or fail.
568 undo_action = IGNORE;
569 } else {
570 undo_action = pc->is_capture() ? CLEAR : RESTORE;
571 }
572 ASSERT(!absolute);
573 ASSERT(value == 0);
574 break;
575 }
576 case ActionNode::CLEAR_CAPTURES: {
577 // Since we're scanning in reverse order, if we've already
578 // set the position we have to ignore historically earlier
579 // clearing operations.
580 if (store_position == -1) {
581 clear = true;
582 }
583 undo_action = RESTORE;
584 ASSERT(!absolute);
585 ASSERT(value == 0);
586 break;
587 }
588 default:
589 UNREACHABLE();
590 break;
591 }
592 }
593 }
594 // Prepare for the undo-action (e.g., push if it's going to be popped).
595 if (undo_action == RESTORE) {
596 assembler->PushRegister(reg);
597 registers_to_pop->Set(reg, isolate);
598 } else if (undo_action == CLEAR) {
599 registers_to_clear->Set(reg, isolate);
600 }
601 // Perform the chronologically last action (or accumulated increment)
602 // for the register.
603 if (store_position != -1) {
604 assembler->WriteCurrentPositionToRegister(reg, store_position);
605 } else if (clear) {
606 assembler->ClearRegisters(reg, reg);
607 } else if (absolute) {
608 assembler->SetRegister(reg, value);
609 } else if (value != 0) {
610 assembler->AdvanceRegister(reg, value);
611 }
612 }
613 }
614
615
616 // This is called as we come into a loop choice node and some other tricky
617 // nodes. It normalizes the state of the code generator to ensure we can
618 // generate generic code.
619 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
620 RegExpMacroAssembler* assembler = compiler->macro_assembler();
621
622 ASSERT(!is_trivial());
623
624 if (actions_ == NULL && backtrack() == NULL) {
625 // Here we just have some deferred cp advances to fix and we are back to
626 // a normal situation. We may also have to forget some information gained
627 // through a quick check that was already performed.
628 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
629 // Create a new trivial state and generate the node with that.
630 Trace new_state;
631 successor->Emit(compiler, &new_state);
632 return;
633 }
634
635 // Generate deferred actions here along with code to undo them again.
636 OutSet affected_registers;
637
638 if (backtrack() != NULL) {
639 // Here we have a concrete backtrack location. These are set up by choice
640 // nodes and so they indicate that we have a deferred save of the current
641 // position which we may need to emit here.
642 assembler->PushCurrentPosition();
643 }
644
645 intptr_t max_register = FindAffectedRegisters(&affected_registers, CI);
646 OutSet registers_to_pop;
647 OutSet registers_to_clear;
648 PerformDeferredActions(assembler,
649 max_register,
650 affected_registers,
651 &registers_to_pop,
652 &registers_to_clear,
653 CI);
654 if (cp_offset_ != 0) {
655 assembler->AdvanceCurrentPosition(cp_offset_);
656 }
657
658 // Create a new trivial state and generate the node with that.
659 BlockLabel undo;
660 assembler->PushBacktrack(&undo);
661 Trace new_state;
662 successor->Emit(compiler, &new_state);
663
664 // On backtrack we need to restore state.
665 assembler->BindBlock(&undo);
666 RestoreAffectedRegisters(assembler,
667 max_register,
668 registers_to_pop,
669 registers_to_clear);
670 if (backtrack() == NULL) {
671 assembler->Backtrack();
672 } else {
673 assembler->PopCurrentPosition();
674 assembler->GoTo(backtrack());
675 }
676 }
677
678
679 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
680 RegExpMacroAssembler* assembler = compiler->macro_assembler();
681
682 // Omit flushing the trace. We discard the entire stack frame anyway.
683
684 if (!label()->IsBound()) {
685 // We are completely independent of the trace, since we ignore it,
686 // so this code can be used as the generic version.
687 assembler->BindBlock(label());
688 }
689
690 // Throw away everything on the backtrack stack since the start
691 // of the negative submatch and restore the character position.
692 assembler->ReadCurrentPositionFromRegister(current_position_register_);
693 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
694 if (clear_capture_count_ > 0) {
695 // Clear any captures that might have been performed during the success
696 // of the body of the negative look-ahead.
697 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
698 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
699 }
700 // Now that we have unwound the stack we find at the top of the stack the
701 // backtrack that the BeginSubmatch node got.
702 assembler->Backtrack();
703 }
704
705
706 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
707 if (!trace->is_trivial()) {
708 trace->Flush(compiler, this);
709 return;
710 }
711 RegExpMacroAssembler* assembler = compiler->macro_assembler();
712 if (!label()->IsBound()) {
713 assembler->BindBlock(label());
714 }
715 switch (action_) {
716 case ACCEPT:
717 assembler->Succeed();
718 return;
719 case BACKTRACK:
720 assembler->GoTo(trace->backtrack());
721 return;
722 case NEGATIVE_SUBMATCH_SUCCESS:
723 // This case is handled in a different virtual method.
724 UNREACHABLE();
725 }
726 UNIMPLEMENTED();
727 }
728
729
730 void GuardedAlternative::AddGuard(Guard* guard, Isolate* isolate) {
731 if (guards_ == NULL)
732 guards_ = new(isolate) ZoneGrowableArray<Guard*>(1);
733 guards_->Add(guard);
734 }
735
736
737 ActionNode* ActionNode::SetRegister(intptr_t reg,
738 intptr_t val,
739 RegExpNode* on_success) {
740 ActionNode* result =
741 new(on_success->isolate()) ActionNode(SET_REGISTER, on_success);
742 result->data_.u_store_register.reg = reg;
743 result->data_.u_store_register.value = val;
744 return result;
745 }
746
747
748 ActionNode* ActionNode::IncrementRegister(intptr_t reg,
749 RegExpNode* on_success) {
750 ActionNode* result =
751 new(on_success->isolate()) ActionNode(INCREMENT_REGISTER, on_success);
752 result->data_.u_increment_register.reg = reg;
753 return result;
754 }
755
756
757 ActionNode* ActionNode::StorePosition(intptr_t reg,
758 bool is_capture,
759 RegExpNode* on_success) {
760 ActionNode* result =
761 new(on_success->isolate()) ActionNode(STORE_POSITION, on_success);
762 result->data_.u_position_register.reg = reg;
763 result->data_.u_position_register.is_capture = is_capture;
764 return result;
765 }
766
767
768 ActionNode* ActionNode::ClearCaptures(Interval range,
769 RegExpNode* on_success) {
770 ActionNode* result =
771 new(on_success->isolate()) ActionNode(CLEAR_CAPTURES, on_success);
772 result->data_.u_clear_captures.range_from = range.from();
773 result->data_.u_clear_captures.range_to = range.to();
774 return result;
775 }
776
777
778 ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg,
779 intptr_t position_reg,
780 RegExpNode* on_success) {
781 ActionNode* result =
782 new(on_success->isolate()) ActionNode(BEGIN_SUBMATCH, on_success);
783 result->data_.u_submatch.stack_pointer_register = stack_reg;
784 result->data_.u_submatch.current_position_register = position_reg;
785 return result;
786 }
787
788
789 ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg,
790 intptr_t position_reg,
791 intptr_t clear_register_count,
792 intptr_t clear_register_from,
793 RegExpNode* on_success) {
794 ActionNode* result =
795 new(on_success->isolate()) ActionNode(POSITIVE_SUBMATCH_SUCCESS,
796 on_success);
797 result->data_.u_submatch.stack_pointer_register = stack_reg;
798 result->data_.u_submatch.current_position_register = position_reg;
799 result->data_.u_submatch.clear_register_count = clear_register_count;
800 result->data_.u_submatch.clear_register_from = clear_register_from;
801 return result;
802 }
803
804
805 ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register,
806 intptr_t repetition_register,
807 intptr_t repetition_limit,
808 RegExpNode* on_success) {
809 ActionNode* result =
810 new(on_success->isolate()) ActionNode(EMPTY_MATCH_CHECK, on_success);
811 result->data_.u_empty_match_check.start_register = start_register;
812 result->data_.u_empty_match_check.repetition_register = repetition_register;
813 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
814 return result;
815 }
816
817
818 #define DEFINE_ACCEPT(Type) \
819 void Type##Node::Accept(NodeVisitor* visitor) { \
820 visitor->Visit##Type(this); \
821 }
822 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
823 #undef DEFINE_ACCEPT
824
825
826 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
827 visitor->VisitLoopChoice(this);
828 }
829
830
831 // -------------------------------------------------------------------
832 // Emit code.
833
834
835 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
836 Guard* guard,
837 Trace* trace) {
838 switch (guard->op()) {
839 case Guard::LT:
840 ASSERT(!trace->mentions_reg(guard->reg()));
841 macro_assembler->IfRegisterGE(guard->reg(),
842 guard->value(),
843 trace->backtrack());
844 break;
845 case Guard::GEQ:
846 ASSERT(!trace->mentions_reg(guard->reg()));
847 macro_assembler->IfRegisterLT(guard->reg(),
848 guard->value(),
849 trace->backtrack());
850 break;
851 }
852 }
853
854
855 // Returns the number of characters in the equivalence class, omitting those
856 // that cannot occur in the source string because it is ASCII.
857 static intptr_t GetCaseIndependentLetters(uint16_t character,
858 bool one_byte_subject,
859 int32_t* letters) {
860 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
861 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);
862 // Unibrow returns 0 or 1 for characters where case independence is
863 // trivial.
864 if (length == 0) {
865 letters[0] = character;
866 length = 1;
867 }
868 if (!one_byte_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
869 return length;
870 }
871
872 // The standard requires that non-ASCII characters cannot have ASCII
873 // character codes in their equivalence class.
874 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
875 // is it? For example, \u00C5 is equivalent to \u212B.
876 return 0;
877 }
878
879
880 static inline bool EmitSimpleCharacter(Isolate* isolate,
881 RegExpCompiler* compiler,
882 uint16_t c,
883 BlockLabel* on_failure,
884 intptr_t cp_offset,
885 bool check,
886 bool preloaded) {
887 RegExpMacroAssembler* assembler = compiler->macro_assembler();
888 bool bound_checked = false;
889 if (!preloaded) {
890 assembler->LoadCurrentCharacter(
891 cp_offset,
892 on_failure,
893 check);
894 bound_checked = true;
895 }
896 assembler->CheckNotCharacter(c, on_failure);
897 return bound_checked;
898 }
899
900
901 // Only emits non-letters (things that don't have case). Only used for case
902 // independent matches.
903 static inline bool EmitAtomNonLetter(Isolate* isolate,
904 RegExpCompiler* compiler,
905 uint16_t c,
906 BlockLabel* on_failure,
907 intptr_t cp_offset,
908 bool check,
909 bool preloaded) {
910 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
911 bool one_byte = compiler->one_byte();
912 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
913 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
914 if (length < 1) {
915 // This can't match. Must be an one-byte subject and a non-one-byte
916 // character. We do not need to do anything since the one-byte pass
917 // already handled this.
918 return false; // Bounds not checked.
919 }
920 bool checked = false;
921 // We handle the length > 1 case in a later pass.
922 if (length == 1) {
923 if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {
924 // Can't match - see above.
925 return false; // Bounds not checked.
926 }
927 if (!preloaded) {
928 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
929 checked = check;
930 }
931 macro_assembler->CheckNotCharacter(c, on_failure);
932 }
933 return checked;
934 }
935
936
937 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
938 bool one_byte,
939 uint16_t c1,
940 uint16_t c2,
941 BlockLabel* on_failure) {
942 uint16_t char_mask;
943 if (one_byte) {
944 char_mask = Symbols::kMaxOneCharCodeSymbol;
945 } else {
946 char_mask = Utf16::kMaxCodeUnit;
947 }
948 uint16_t exor = c1 ^ c2;
949 // Check whether exor has only one bit set.
950 if (((exor - 1) & exor) == 0) {
951 // If c1 and c2 differ only by one bit.
952 // Ecma262UnCanonicalize always gives the highest number last.
953 ASSERT(c2 > c1);
954 uint16_t mask = char_mask ^ exor;
955 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
956 return true;
957 }
958 ASSERT(c2 > c1);
959 uint16_t diff = c2 - c1;
960 if (((diff - 1) & diff) == 0 && c1 >= diff) {
961 // If the characters differ by 2^n but don't differ by one bit then
962 // subtract the difference from the found character, then do the or
963 // trick. We avoid the theoretical case where negative numbers are
964 // involved in order to simplify code generation.
965 uint16_t mask = char_mask ^ diff;
966 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
967 diff,
968 mask,
969 on_failure);
970 return true;
971 }
972 return false;
973 }
974
975
976 typedef bool EmitCharacterFunction(Isolate* isolate,
977 RegExpCompiler* compiler,
978 uint16_t c,
979 BlockLabel* on_failure,
980 intptr_t cp_offset,
981 bool check,
982 bool preloaded);
983
984 // Only emits letters (things that have case). Only used for case independent
985 // matches.
986 static inline bool EmitAtomLetter(Isolate* isolate,
987 RegExpCompiler* compiler,
988 uint16_t c,
989 BlockLabel* on_failure,
990 intptr_t cp_offset,
991 bool check,
992 bool preloaded) {
993 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
994 bool one_byte = compiler->one_byte();
995 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
996 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
997 if (length <= 1) return false;
998 // We may not need to check against the end of the input string
999 // if this character lies before a character that matched.
1000 if (!preloaded) {
1001 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1002 }
1003 BlockLabel ok;
1004 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1005 switch (length) {
1006 case 2: {
1007 if (ShortCutEmitCharacterPair(macro_assembler,
1008 one_byte,
1009 chars[0],
1010 chars[1],
1011 on_failure)) {
1012 } else {
1013 macro_assembler->CheckCharacter(chars[0], &ok);
1014 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1015 macro_assembler->BindBlock(&ok);
1016 }
1017 break;
1018 }
1019 case 4:
1020 macro_assembler->CheckCharacter(chars[3], &ok);
1021 // Fall through!
1022 case 3:
1023 macro_assembler->CheckCharacter(chars[0], &ok);
1024 macro_assembler->CheckCharacter(chars[1], &ok);
1025 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1026 macro_assembler->BindBlock(&ok);
1027 break;
1028 default:
1029 UNREACHABLE();
1030 break;
1031 }
1032 return true;
1033 }
1034
1035
1036 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1037 intptr_t border,
1038 BlockLabel* fall_through,
1039 BlockLabel* above_or_equal,
1040 BlockLabel* below) {
1041 if (below != fall_through) {
1042 masm->CheckCharacterLT(border, below);
1043 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1044 } else {
1045 masm->CheckCharacterGT(border - 1, above_or_equal);
1046 }
1047 }
1048
1049
1050 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1051 intptr_t first,
1052 intptr_t last,
1053 BlockLabel* fall_through,
1054 BlockLabel* in_range,
1055 BlockLabel* out_of_range) {
1056 if (in_range == fall_through) {
1057 if (first == last) {
1058 masm->CheckNotCharacter(first, out_of_range);
1059 } else {
1060 masm->CheckCharacterNotInRange(first, last, out_of_range);
1061 }
1062 } else {
1063 if (first == last) {
1064 masm->CheckCharacter(first, in_range);
1065 } else {
1066 masm->CheckCharacterInRange(first, last, in_range);
1067 }
1068 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1069 }
1070 }
1071
1072
1073 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1074 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1075 static void EmitUseLookupTable(
1076 RegExpMacroAssembler* masm,
1077 ZoneGrowableArray<int>* ranges,
1078 intptr_t start_index,
1079 intptr_t end_index,
1080 intptr_t min_char,
1081 BlockLabel* fall_through,
1082 BlockLabel* even_label,
1083 BlockLabel* odd_label) {
1084 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1085 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1086
1087 intptr_t base = (min_char & ~kMask);
1088
1089 // Assert that everything is on one kTableSize page.
1090 for (intptr_t i = start_index; i <= end_index; i++) {
1091 ASSERT((ranges->At(i) & ~kMask) == base);
1092 }
1093 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
1094
1095 char templ[kSize];
1096 BlockLabel* on_bit_set;
1097 BlockLabel* on_bit_clear;
1098 intptr_t bit;
1099 if (even_label == fall_through) {
1100 on_bit_set = odd_label;
1101 on_bit_clear = even_label;
1102 bit = 1;
1103 } else {
1104 on_bit_set = even_label;
1105 on_bit_clear = odd_label;
1106 bit = 0;
1107 }
1108 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;
1109 i++) {
1110 templ[i] = bit;
1111 }
1112 intptr_t j = 0;
1113 bit ^= 1;
1114 for (intptr_t i = start_index; i < end_index; i++) {
1115 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {
1116 templ[j] = bit;
1117 }
1118 bit ^= 1;
1119 }
1120 for (intptr_t i = j; i < kSize; i++) {
1121 templ[i] = bit;
1122 }
1123 // TODO(erikcorry): Cache these.
1124 const TypedData& ba = TypedData::ZoneHandle(
1125 masm->isolate(),
1126 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
1127 for (intptr_t i = 0; i < kSize; i++) {
1128 ba.SetUint8(i, templ[i]);
1129 }
1130 masm->CheckBitInTable(ba, on_bit_set);
1131 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1132 }
1133
1134
1135 static void CutOutRange(RegExpMacroAssembler* masm,
1136 ZoneGrowableArray<int>* ranges,
1137 intptr_t start_index,
1138 intptr_t end_index,
1139 intptr_t cut_index,
1140 BlockLabel* even_label,
1141 BlockLabel* odd_label) {
1142 bool odd = (((cut_index - start_index) & 1) == 1);
1143 BlockLabel* in_range_label = odd ? odd_label : even_label;
1144 BlockLabel dummy;
1145 EmitDoubleBoundaryTest(masm,
1146 ranges->At(cut_index),
1147 ranges->At(cut_index + 1) - 1,
1148 &dummy,
1149 in_range_label,
1150 &dummy);
1151 ASSERT(!dummy.IsLinked());
1152 // Cut out the single range by rewriting the array. This creates a new
1153 // range that is a merger of the two ranges on either side of the one we
1154 // are cutting out. The oddity of the labels is preserved.
1155 for (intptr_t j = cut_index; j > start_index; j--) {
1156 (*ranges)[j] = ranges->At(j - 1);
1157 }
1158 for (intptr_t j = cut_index + 1; j < end_index; j++) {
1159 (*ranges)[j] = ranges->At(j + 1);
1160 }
1161 }
1162
1163
1164 // Unicode case. Split the search space into kSize spaces that are handled
1165 // with recursion.
1166 static void SplitSearchSpace(ZoneGrowableArray<int>* ranges,
1167 intptr_t start_index,
1168 intptr_t end_index,
1169 intptr_t* new_start_index,
1170 intptr_t* new_end_index,
1171 intptr_t* border) {
1172 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1173 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1174
1175 intptr_t first = ranges->At(start_index);
1176 intptr_t last = ranges->At(end_index) - 1;
1177
1178 *new_start_index = start_index;
1179 *border = (ranges->At(start_index) & ~kMask) + kSize;
1180 while (*new_start_index < end_index) {
1181 if (ranges->At(*new_start_index) > *border) break;
1182 (*new_start_index)++;
1183 }
1184 // new_start_index is the index of the first edge that is beyond the
1185 // current kSize space.
1186
1187 // For very large search spaces we do a binary chop search of the non-Latin1
1188 // space instead of just going to the end of the current kSize space. The
1189 // heuristics are complicated a little by the fact that any 128-character
1190 // encoding space can be quickly tested with a table lookup, so we don't
1191 // wish to do binary chop search at a smaller granularity than that. A
1192 // 128-character space can take up a lot of space in the ranges array if,
1193 // for example, we only want to match every second character (eg. the lower
1194 // case characters on some Unicode pages).
1195 intptr_t binary_chop_index = (end_index + start_index) / 2;
1196 // The first test ensures that we get to the code that handles the Latin1
1197 // range with a single not-taken branch, speeding up this important
1198 // character range (even non-Latin1 charset-based text has spaces and
1199 // punctuation).
1200 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.
1201 end_index - start_index > (*new_start_index - start_index) * 2 &&
1202 last - first > kSize * 2 &&
1203 binary_chop_index > *new_start_index &&
1204 ranges->At(binary_chop_index) >= first + 2 * kSize) {
1205 intptr_t scan_forward_for_section_border = binary_chop_index;;
1206 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1;
1207
1208 while (scan_forward_for_section_border < end_index) {
1209 if (ranges->At(scan_forward_for_section_border) > new_border) {
1210 *new_start_index = scan_forward_for_section_border;
1211 *border = new_border;
1212 break;
1213 }
1214 scan_forward_for_section_border++;
1215 }
1216 }
1217
1218 ASSERT(*new_start_index > start_index);
1219 *new_end_index = *new_start_index - 1;
1220 if (ranges->At(*new_end_index) == *border) {
1221 (*new_end_index)--;
1222 }
1223 if (*border >= ranges->At(end_index)) {
1224 *border = ranges->At(end_index);
1225 *new_start_index = end_index; // Won't be used.
1226 *new_end_index = end_index - 1;
1227 }
1228 }
1229
1230
1231 // Gets a series of segment boundaries representing a character class. If the
1232 // character is in the range between an even and an odd boundary (counting from
1233 // start_index) then go to even_label, otherwise go to odd_label. We already
1234 // know that the character is in the range of min_char to max_char inclusive.
1235 // Either label can be NULL indicating backtracking. Either label can also be
1236 // equal to the fall_through label.
1237 static void GenerateBranches(RegExpMacroAssembler* masm,
1238 ZoneGrowableArray<int>* ranges,
1239 intptr_t start_index,
1240 intptr_t end_index,
1241 uint16_t min_char,
1242 uint16_t max_char,
1243 BlockLabel* fall_through,
1244 BlockLabel* even_label,
1245 BlockLabel* odd_label) {
1246 intptr_t first = ranges->At(start_index);
1247 intptr_t last = ranges->At(end_index) - 1;
1248
1249 ASSERT(min_char < first);
1250
1251 // Just need to test if the character is before or on-or-after
1252 // a particular character.
1253 if (start_index == end_index) {
1254 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1255 return;
1256 }
1257
1258 // Another almost trivial case: There is one interval in the middle that is
1259 // different from the end intervals.
1260 if (start_index + 1 == end_index) {
1261 EmitDoubleBoundaryTest(
1262 masm, first, last, fall_through, even_label, odd_label);
1263 return;
1264 }
1265
1266 // It's not worth using table lookup if there are very few intervals in the
1267 // character class.
1268 if (end_index - start_index <= 6) {
1269 // It is faster to test for individual characters, so we look for those
1270 // first, then try arbitrary ranges in the second round.
1271 static intptr_t kNoCutIndex = -1;
1272 intptr_t cut = kNoCutIndex;
1273 for (intptr_t i = start_index; i < end_index; i++) {
1274 if (ranges->At(i) == ranges->At(i + 1) - 1) {
1275 cut = i;
1276 break;
1277 }
1278 }
1279 if (cut == kNoCutIndex) cut = start_index;
1280 CutOutRange(
1281 masm, ranges, start_index, end_index, cut, even_label, odd_label);
1282 ASSERT(end_index - start_index >= 2);
1283 GenerateBranches(masm,
1284 ranges,
1285 start_index + 1,
1286 end_index - 1,
1287 min_char,
1288 max_char,
1289 fall_through,
1290 even_label,
1291 odd_label);
1292 return;
1293 }
1294
1295 // If there are a lot of intervals in the regexp, then we will use tables to
1296 // determine whether the character is inside or outside the character class.
1297 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;
1298
1299 if ((max_char >> kBits) == (min_char >> kBits)) {
1300 EmitUseLookupTable(masm,
1301 ranges,
1302 start_index,
1303 end_index,
1304 min_char,
1305 fall_through,
1306 even_label,
1307 odd_label);
1308 return;
1309 }
1310
1311 if ((min_char >> kBits) != (first >> kBits)) {
1312 masm->CheckCharacterLT(first, odd_label);
1313 GenerateBranches(masm,
1314 ranges,
1315 start_index + 1,
1316 end_index,
1317 first,
1318 max_char,
1319 fall_through,
1320 odd_label,
1321 even_label);
1322 return;
1323 }
1324
1325 intptr_t new_start_index = 0;
1326 intptr_t new_end_index = 0;
1327 intptr_t border = 0;
1328
1329 SplitSearchSpace(ranges,
1330 start_index,
1331 end_index,
1332 &new_start_index,
1333 &new_end_index,
1334 &border);
1335
1336 BlockLabel handle_rest;
1337 BlockLabel* above = &handle_rest;
1338 if (border == last + 1) {
1339 // We didn't find any section that started after the limit, so everything
1340 // above the border is one of the terminal labels.
1341 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1342 ASSERT(new_end_index == end_index - 1);
1343 }
1344
1345 ASSERT(start_index <= new_end_index);
1346 ASSERT(new_start_index <= end_index);
1347 ASSERT(start_index < new_start_index);
1348 ASSERT(new_end_index < end_index);
1349 ASSERT(new_end_index + 1 == new_start_index ||
1350 (new_end_index + 2 == new_start_index &&
1351 border == ranges->At(new_end_index + 1)));
1352 ASSERT(min_char < border - 1);
1353 ASSERT(border < max_char);
1354 ASSERT(ranges->At(new_end_index) < border);
1355 ASSERT(border < ranges->At(new_start_index) ||
1356 (border == ranges->At(new_start_index) &&
1357 new_start_index == end_index &&
1358 new_end_index == end_index - 1 &&
1359 border == last + 1));
1360 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
1361
1362 masm->CheckCharacterGT(border - 1, above);
1363 BlockLabel dummy;
1364 GenerateBranches(masm,
1365 ranges,
1366 start_index,
1367 new_end_index,
1368 min_char,
1369 border - 1,
1370 &dummy,
1371 even_label,
1372 odd_label);
1373
1374 if (handle_rest.IsLinked()) {
1375 masm->BindBlock(&handle_rest);
1376 bool flip = (new_start_index & 1) != (start_index & 1);
1377 GenerateBranches(masm,
1378 ranges,
1379 new_start_index,
1380 end_index,
1381 border,
1382 max_char,
1383 &dummy,
1384 flip ? odd_label : even_label,
1385 flip ? even_label : odd_label);
1386 }
1387 }
1388
1389
1390 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1391 RegExpCharacterClass* cc,
1392 bool one_byte,
1393 BlockLabel* on_failure,
1394 intptr_t cp_offset,
1395 bool check_offset,
1396 bool preloaded,
1397 Isolate* isolate) {
1398 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1399 if (!CharacterRange::IsCanonical(ranges)) {
1400 CharacterRange::Canonicalize(ranges);
1401 }
1402
1403 intptr_t max_char;
1404 if (one_byte) {
1405 max_char = Symbols::kMaxOneCharCodeSymbol;
1406 } else {
1407 max_char = Utf16::kMaxCodeUnit;
1408 }
1409
1410 intptr_t range_count = ranges->length();
1411
1412 intptr_t last_valid_range = range_count - 1;
1413 while (last_valid_range >= 0) {
1414 CharacterRange& range = (*ranges)[last_valid_range];
1415 if (range.from() <= max_char) {
1416 break;
1417 }
1418 last_valid_range--;
1419 }
1420
1421 if (last_valid_range < 0) {
1422 if (!cc->is_negated()) {
1423 macro_assembler->GoTo(on_failure);
1424 }
1425 if (check_offset) {
1426 macro_assembler->CheckPosition(cp_offset, on_failure);
1427 }
1428 return;
1429 }
1430
1431 if (last_valid_range == 0 &&
1432 ranges->At(0).IsEverything(max_char)) {
1433 if (cc->is_negated()) {
1434 macro_assembler->GoTo(on_failure);
1435 } else {
1436 // This is a common case hit by non-anchored expressions.
1437 if (check_offset) {
1438 macro_assembler->CheckPosition(cp_offset, on_failure);
1439 }
1440 }
1441 return;
1442 }
1443 if (last_valid_range == 0 &&
1444 !cc->is_negated() &&
1445 ranges->At(0).IsEverything(max_char)) {
1446 // This is a common case hit by non-anchored expressions.
1447 if (check_offset) {
1448 macro_assembler->CheckPosition(cp_offset, on_failure);
1449 }
1450 return;
1451 }
1452
1453 if (!preloaded) {
1454 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1455 }
1456
1457 if (cc->is_standard() &&
1458 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1459 on_failure)) {
1460 return;
1461 }
1462
1463
1464 // A new list with ascending entries. Each entry is a code unit
1465 // where there is a boundary between code units that are part of
1466 // the class and code units that are not. Normally we insert an
1467 // entry at zero which goes to the failure label, but if there
1468 // was already one there we fall through for success on that entry.
1469 // Subsequent entries have alternating meaning (success/failure).
1470 ZoneGrowableArray<int>* range_boundaries =
1471 new(isolate) ZoneGrowableArray<int>(last_valid_range);
1472
1473 bool zeroth_entry_is_failure = !cc->is_negated();
1474
1475 for (intptr_t i = 0; i <= last_valid_range; i++) {
1476 CharacterRange& range = (*ranges)[i];
1477 if (range.from() == 0) {
1478 ASSERT(i == 0);
1479 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1480 } else {
1481 range_boundaries->Add(range.from());
1482 }
1483 range_boundaries->Add(range.to() + 1);
1484 }
1485 intptr_t end_index = range_boundaries->length() - 1;
1486 if (range_boundaries->At(end_index) > max_char) {
1487 end_index--;
1488 }
1489
1490 BlockLabel fall_through;
1491 GenerateBranches(macro_assembler,
1492 range_boundaries,
1493 0, // start_index.
1494 end_index,
1495 0, // min_char.
1496 max_char,
1497 &fall_through,
1498 zeroth_entry_is_failure ? &fall_through : on_failure,
1499 zeroth_entry_is_failure ? on_failure : &fall_through);
1500 macro_assembler->BindBlock(&fall_through);
1501 }
1502
1503
1504 RegExpNode::~RegExpNode() {
1505 }
1506
1507
1508 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1509 Trace* trace) {
1510 // If we are generating a greedy loop then don't stop and don't reuse code.
1511 if (trace->stop_node() != NULL) {
1512 return CONTINUE;
1513 }
1514
1515 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1516 if (trace->is_trivial()) {
1517 if (label_.IsBound()) {
1518 // We are being asked to generate a generic version, but that's already
1519 // been done so just go to it.
1520 macro_assembler->GoTo(&label_);
1521 return DONE;
1522 }
1523 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1524 // To avoid too deep recursion we push the node to the work queue and just
1525 // generate a goto here.
1526 compiler->AddWork(this);
1527 macro_assembler->GoTo(&label_);
1528 return DONE;
1529 }
1530 // Generate generic version of the node and bind the label for later use.
1531 macro_assembler->BindBlock(&label_);
1532 return CONTINUE;
1533 }
1534
1535 // We are being asked to make a non-generic version. Keep track of how many
1536 // non-generic versions we generate so as not to overdo it.
1537 trace_count_++;
1538 if (kRegexpOptimization &&
1539 trace_count_ < kMaxCopiesCodeGenerated &&
1540 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1541 return CONTINUE;
1542 }
1543
1544 // If we get here code has been generated for this node too many times or
1545 // recursion is too deep. Time to switch to a generic version. The code for
1546 // generic versions above can handle deep recursion properly.
1547 trace->Flush(compiler, this);
1548 return DONE;
1549 }
1550
1551
1552 intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
1553 intptr_t budget,
1554 bool not_at_start) {
1555 if (budget <= 0) return 0;
1556 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1557 return on_success()->EatsAtLeast(still_to_find,
1558 budget - 1,
1559 not_at_start);
1560 }
1561
1562
1563 void ActionNode::FillInBMInfo(intptr_t offset,
1564 intptr_t budget,
1565 BoyerMooreLookahead* bm,
1566 bool not_at_start) {
1567 if (action_type_ == BEGIN_SUBMATCH) {
1568 bm->SetRest(offset);
1569 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1570 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1571 }
1572 SaveBMInfo(bm, not_at_start, offset);
1573 }
1574
1575
1576 intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
1577 intptr_t budget,
1578 bool not_at_start) {
1579 if (budget <= 0) return 0;
1580 // If we know we are not at the start and we are asked "how many characters
1581 // will you match if you succeed?" then we can answer anything since false
1582 // implies false. So lets just return the max answer (still_to_find) since
1583 // that won't prevent us from preloading a lot of characters for the other
1584 // branches in the node graph.
1585 if (assertion_type() == AT_START && not_at_start) return still_to_find;
1586 return on_success()->EatsAtLeast(still_to_find,
1587 budget - 1,
1588 not_at_start);
1589 }
1590
1591
1592 void AssertionNode::FillInBMInfo(intptr_t offset,
1593 intptr_t budget,
1594 BoyerMooreLookahead* bm,
1595 bool not_at_start) {
1596 // Match the behaviour of EatsAtLeast on this node.
1597 if (assertion_type() == AT_START && not_at_start) return;
1598 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1599 SaveBMInfo(bm, not_at_start, offset);
1600 }
1601
1602
1603 intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
1604 intptr_t budget,
1605 bool not_at_start) {
1606 if (budget <= 0) return 0;
1607 return on_success()->EatsAtLeast(still_to_find,
1608 budget - 1,
1609 not_at_start);
1610 }
1611
1612
1613 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
1614 intptr_t budget,
1615 bool not_at_start) {
1616 intptr_t answer = Length();
1617 if (answer >= still_to_find) return answer;
1618 if (budget <= 0) return answer;
1619 // We are not at start after this node so we set the last argument to 'true'.
1620 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1621 budget - 1,
1622 true);
1623 }
1624
1625
1626 intptr_t NegativeLookaheadChoiceNode::EatsAtLeast(intptr_t still_to_find,
1627 intptr_t budget,
1628 bool not_at_start) {
1629 if (budget <= 0) return 0;
1630 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1631 // afterwards.
1632 RegExpNode* node = (*alternatives_)[1].node();
1633 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1634 }
1635
1636
1637 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1638 QuickCheckDetails* details,
1639 RegExpCompiler* compiler,
1640 intptr_t filled_in,
1641 bool not_at_start) {
1642 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1643 // afterwards.
1644 RegExpNode* node = (*alternatives_)[1].node();
1645 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1646 }
1647
1648
1649 intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
1650 intptr_t budget,
1651 RegExpNode* ignore_this_node,
1652 bool not_at_start) {
1653 if (budget <= 0) return 0;
1654 intptr_t min = 100;
1655 intptr_t choice_count = alternatives_->length();
1656 budget = (budget - 1) / choice_count;
1657 for (intptr_t i = 0; i < choice_count; i++) {
1658 RegExpNode* node = (*alternatives_)[i].node();
1659 if (node == ignore_this_node) continue;
1660 intptr_t node_eats_at_least =
1661 node->EatsAtLeast(still_to_find, budget, not_at_start);
1662 if (node_eats_at_least < min) min = node_eats_at_least;
1663 if (min == 0) return 0;
1664 }
1665 return min;
1666 }
1667
1668
1669 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1670 intptr_t budget,
1671 bool not_at_start) {
1672 return EatsAtLeastHelper(still_to_find,
1673 budget - 1,
1674 loop_node_,
1675 not_at_start);
1676 }
1677
1678
1679 intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
1680 intptr_t budget,
1681 bool not_at_start) {
1682 return EatsAtLeastHelper(still_to_find,
1683 budget,
1684 NULL,
1685 not_at_start);
1686 }
1687
1688
1689 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
1690 static inline uint32_t SmearBitsRight(uint32_t v) {
1691 v |= v >> 1;
1692 v |= v >> 2;
1693 v |= v >> 4;
1694 v |= v >> 8;
1695 v |= v >> 16;
1696 return v;
1697 }
1698
1699
1700 bool QuickCheckDetails::Rationalize(bool asc) {
1701 bool found_useful_op = false;
1702 uint32_t char_mask;
1703 if (asc) {
1704 char_mask = Symbols::kMaxOneCharCodeSymbol;
1705 } else {
1706 char_mask = Utf16::kMaxCodeUnit;
1707 }
1708 mask_ = 0;
1709 value_ = 0;
1710 intptr_t char_shift = 0;
1711 for (intptr_t i = 0; i < characters_; i++) {
1712 Position* pos = &positions_[i];
1713 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1714 found_useful_op = true;
1715 }
1716 mask_ |= (pos->mask & char_mask) << char_shift;
1717 value_ |= (pos->value & char_mask) << char_shift;
1718 char_shift += asc ? 8 : 16;
1719 }
1720 return found_useful_op;
1721 }
1722
1723
1724 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1725 Trace* bounds_check_trace,
1726 Trace* trace,
1727 bool preload_has_checked_bounds,
1728 BlockLabel* on_possible_success,
1729 QuickCheckDetails* details,
1730 bool fall_through_on_failure) {
1731 if (details->characters() == 0) return false;
1732 GetQuickCheckDetails(
1733 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
1734 if (details->cannot_match()) return false;
1735 if (!details->Rationalize(compiler->one_byte())) return false;
1736 ASSERT(details->characters() == 1 ||
1737 compiler->macro_assembler()->CanReadUnaligned());
1738 uint32_t mask = details->mask();
1739 uint32_t value = details->value();
1740
1741 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1742
1743 if (trace->characters_preloaded() != details->characters()) {
1744 ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());
1745 // We are attempting to preload the minimum number of characters
1746 // any choice would eat, so if the bounds check fails, then none of the
1747 // choices can succeed, so we can just immediately backtrack, rather
1748 // than go to the next choice.
1749 assembler->LoadCurrentCharacter(trace->cp_offset(),
1750 bounds_check_trace->backtrack(),
1751 !preload_has_checked_bounds,
1752 details->characters());
1753 }
1754
1755
1756 bool need_mask = true;
1757
1758 if (details->characters() == 1) {
1759 // If number of characters preloaded is 1 then we used a byte or 16 bit
1760 // load so the value is already masked down.
1761 uint32_t char_mask;
1762 if (compiler->one_byte()) {
1763 char_mask = Symbols::kMaxOneCharCodeSymbol;
1764 } else {
1765 char_mask = Utf16::kMaxCodeUnit;
1766 }
1767 if ((mask & char_mask) == char_mask) need_mask = false;
1768 mask &= char_mask;
1769 } else {
1770 // For 2-character preloads in one-byte mode or 1-character preloads in
1771 // two-byte mode we also use a 16 bit load with zero extend.
1772 if (details->characters() == 2 && compiler->one_byte()) {
1773 if ((mask & 0xffff) == 0xffff) need_mask = false;
1774 } else if (details->characters() == 1 && !compiler->one_byte()) {
1775 if ((mask & 0xffff) == 0xffff) need_mask = false;
1776 } else {
1777 if (mask == 0xffffffff) need_mask = false;
1778 }
1779 }
1780
1781 if (fall_through_on_failure) {
1782 if (need_mask) {
1783 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1784 } else {
1785 assembler->CheckCharacter(value, on_possible_success);
1786 }
1787 } else {
1788 if (need_mask) {
1789 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1790 } else {
1791 assembler->CheckNotCharacter(value, trace->backtrack());
1792 }
1793 }
1794 return true;
1795 }
1796
1797
1798 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1799 // super-class in the .h file).
1800 //
1801 // We iterate along the text object, building up for each character a
1802 // mask and value that can be used to test for a quick failure to match.
1803 // The masks and values for the positions will be combined into a single
1804 // machine word for the current character width in order to be used in
1805 // generating a quick check.
1806 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1807 RegExpCompiler* compiler,
1808 intptr_t characters_filled_in,
1809 bool not_at_start) {
1810 ASSERT(characters_filled_in < details->characters());
1811 intptr_t characters = details->characters();
1812 intptr_t char_mask;
1813 if (compiler->one_byte()) {
1814 char_mask = Symbols::kMaxOneCharCodeSymbol;
1815 } else {
1816 char_mask = Utf16::kMaxCodeUnit;
1817 }
1818 for (intptr_t k = 0; k < elms_->length(); k++) {
1819 TextElement elm = elms_->At(k);
1820 if (elm.text_type() == TextElement::ATOM) {
1821 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1822 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
1823 QuickCheckDetails::Position* pos =
1824 details->positions(characters_filled_in);
1825 uint16_t c = quarks->At(i);
1826 if (c > char_mask) {
1827 // If we expect a non-Latin1 character from an one-byte string,
1828 // there is no way we can match. Not even case independent
1829 // matching can turn an Latin1 character into non-Latin1 or
1830 // vice versa.
1831 // TODO(dcarney): issue 3550. Verify that this works as expected.
1832 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
1833 details->set_cannot_match();
1834 pos->determines_perfectly = false;
1835 return;
1836 }
1837 if (compiler->ignore_case()) {
1838 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1839 intptr_t length =
1840 GetCaseIndependentLetters(c, compiler->one_byte(), chars);
1841 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1842 if (length == 1) {
1843 // This letter has no case equivalents, so it's nice and simple
1844 // and the mask-compare will determine definitely whether we have
1845 // a match at this character position.
1846 pos->mask = char_mask;
1847 pos->value = c;
1848 pos->determines_perfectly = true;
1849 } else {
1850 uint32_t common_bits = char_mask;
1851 uint32_t bits = chars[0];
1852 for (intptr_t j = 1; j < length; j++) {
1853 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1854 common_bits ^= differing_bits;
1855 bits &= common_bits;
1856 }
1857 // If length is 2 and common bits has only one zero in it then
1858 // our mask and compare instruction will determine definitely
1859 // whether we have a match at this character position. Otherwise
1860 // it can only be an approximate check.
1861 uint32_t one_zero = (common_bits | ~char_mask);
1862 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1863 pos->determines_perfectly = true;
1864 }
1865 pos->mask = common_bits;
1866 pos->value = bits;
1867 }
1868 } else {
1869 // Don't ignore case. Nice simple case where the mask-compare will
1870 // determine definitely whether we have a match at this character
1871 // position.
1872 pos->mask = char_mask;
1873 pos->value = c;
1874 pos->determines_perfectly = true;
1875 }
1876 characters_filled_in++;
1877 ASSERT(characters_filled_in <= details->characters());
1878 if (characters_filled_in == details->characters()) {
1879 return;
1880 }
1881 }
1882 } else {
1883 QuickCheckDetails::Position* pos =
1884 details->positions(characters_filled_in);
1885 RegExpCharacterClass* tree = elm.char_class();
1886 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
1887 if (tree->is_negated()) {
1888 // A quick check uses multi-character mask and compare. There is no
1889 // useful way to incorporate a negative char class into this scheme
1890 // so we just conservatively create a mask and value that will always
1891 // succeed.
1892 pos->mask = 0;
1893 pos->value = 0;
1894 } else {
1895 intptr_t first_range = 0;
1896 while (ranges->At(first_range).from() > char_mask) {
1897 first_range++;
1898 if (first_range == ranges->length()) {
1899 details->set_cannot_match();
1900 pos->determines_perfectly = false;
1901 return;
1902 }
1903 }
1904 CharacterRange range = ranges->At(first_range);
1905 uint16_t from = range.from();
1906 uint16_t to = range.to();
1907 if (to > char_mask) {
1908 to = char_mask;
1909 }
1910 uint32_t differing_bits = (from ^ to);
1911 // A mask and compare is only perfect if the differing bits form a
1912 // number like 00011111 with one single block of trailing 1s.
1913 if ((differing_bits & (differing_bits + 1)) == 0 &&
1914 from + differing_bits == to) {
1915 pos->determines_perfectly = true;
1916 }
1917 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1918 uint32_t bits = (from & common_bits);
1919 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
1920 CharacterRange range = ranges->At(i);
1921 uint16_t from = range.from();
1922 uint16_t to = range.to();
1923 if (from > char_mask) continue;
1924 if (to > char_mask) to = char_mask;
1925 // Here we are combining more ranges into the mask and compare
1926 // value. With each new range the mask becomes more sparse and
1927 // so the chances of a false positive rise. A character class
1928 // with multiple ranges is assumed never to be equivalent to a
1929 // mask and compare operation.
1930 pos->determines_perfectly = false;
1931 uint32_t new_common_bits = (from ^ to);
1932 new_common_bits = ~SmearBitsRight(new_common_bits);
1933 common_bits &= new_common_bits;
1934 bits &= new_common_bits;
1935 uint32_t differing_bits = (from & common_bits) ^ bits;
1936 common_bits ^= differing_bits;
1937 bits &= common_bits;
1938 }
1939 pos->mask = common_bits;
1940 pos->value = bits;
1941 }
1942 characters_filled_in++;
1943 ASSERT(characters_filled_in <= details->characters());
1944 if (characters_filled_in == details->characters()) {
1945 return;
1946 }
1947 }
1948 }
1949 ASSERT(characters_filled_in != details->characters());
1950 if (!details->cannot_match()) {
1951 on_success()-> GetQuickCheckDetails(details,
1952 compiler,
1953 characters_filled_in,
1954 true);
1955 }
1956 }
1957
1958
1959 void QuickCheckDetails::Clear() {
1960 for (int i = 0; i < characters_; i++) {
1961 positions_[i].mask = 0;
1962 positions_[i].value = 0;
1963 positions_[i].determines_perfectly = false;
1964 }
1965 characters_ = 0;
1966 }
1967
1968
1969 void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
1970 ASSERT(by >= 0);
1971 if (by >= characters_) {
1972 Clear();
1973 return;
1974 }
1975 for (intptr_t i = 0; i < characters_ - by; i++) {
1976 positions_[i] = positions_[by + i];
1977 }
1978 for (intptr_t i = characters_ - by; i < characters_; i++) {
1979 positions_[i].mask = 0;
1980 positions_[i].value = 0;
1981 positions_[i].determines_perfectly = false;
1982 }
1983 characters_ -= by;
1984 // We could change mask_ and value_ here but we would never advance unless
1985 // they had already been used in a check and they won't be used again because
1986 // it would gain us nothing. So there's no point.
1987 }
1988
1989
1990 void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
1991 ASSERT(characters_ == other->characters_);
1992 if (other->cannot_match_) {
1993 return;
1994 }
1995 if (cannot_match_) {
1996 *this = *other;
1997 return;
1998 }
1999 for (intptr_t i = from_index; i < characters_; i++) {
2000 QuickCheckDetails::Position* pos = positions(i);
2001 QuickCheckDetails::Position* other_pos = other->positions(i);
2002 if (pos->mask != other_pos->mask ||
2003 pos->value != other_pos->value ||
2004 !other_pos->determines_perfectly) {
2005 // Our mask-compare operation will be approximate unless we have the
2006 // exact same operation on both sides of the alternation.
2007 pos->determines_perfectly = false;
2008 }
2009 pos->mask &= other_pos->mask;
2010 pos->value &= pos->mask;
2011 other_pos->value &= pos->mask;
2012 uint16_t differing_bits = (pos->value ^ other_pos->value);
2013 pos->mask &= ~differing_bits;
2014 pos->value &= pos->mask;
2015 }
2016 }
2017
2018
2019 class VisitMarker : public ValueObject {
2020 public:
2021 explicit VisitMarker(NodeInfo* info) : info_(info) {
2022 ASSERT(!info->visited);
2023 info->visited = true;
2024 }
2025 ~VisitMarker() {
2026 info_->visited = false;
2027 }
2028 private:
2029 NodeInfo* info_;
2030 };
2031
2032
2033 RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth, bool ignore_case) {
2034 if (info()->replacement_calculated) return replacement();
2035 if (depth < 0) return this;
2036 ASSERT(!info()->visited);
2037 VisitMarker marker(info());
2038 return FilterSuccessor(depth - 1, ignore_case);
2039 }
2040
2041
2042 RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth, bool ignore_case) {
2043 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2044 if (next == NULL) return set_replacement(NULL);
2045 on_success_ = next;
2046 return set_replacement(this);
2047 }
2048
2049
2050 // We need to check for the following characters: 0x39c 0x3bc 0x178.
2051 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2052 // TODO(dcarney): this could be a lot more efficient.
2053 return range.Contains(0x39c) ||
2054 range.Contains(0x3bc) || range.Contains(0x178);
2055 }
2056
2057
2058 static bool RangesContainLatin1Equivalents(
2059 ZoneGrowableArray<CharacterRange>* ranges) {
2060 for (intptr_t i = 0; i < ranges->length(); i++) {
2061 // TODO(dcarney): this could be a lot more efficient.
2062 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;
2063 }
2064 return false;
2065 }
2066
2067
2068 static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
2069 ASSERT(c > Symbols::kMaxOneCharCodeSymbol);
2070 switch (c) {
2071 // This are equivalent characters in unicode.
2072 case 0x39c:
2073 case 0x3bc:
2074 return 0xb5;
2075 // This is an uppercase of a Latin-1 character
2076 // outside of Latin-1.
2077 case 0x178:
2078 return 0xff;
2079 }
2080 return 0;
2081 }
2082
2083
2084 RegExpNode* TextNode::FilterOneByte(intptr_t depth, bool ignore_case) {
2085 if (info()->replacement_calculated) return replacement();
2086 if (depth < 0) return this;
2087 ASSERT(!info()->visited);
2088 VisitMarker marker(info());
2089 intptr_t element_count = elms_->length();
2090 for (intptr_t i = 0; i < element_count; i++) {
2091 TextElement elm = elms_->At(i);
2092 if (elm.text_type() == TextElement::ATOM) {
2093 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2094 for (intptr_t j = 0; j < quarks->length(); j++) {
2095 uint16_t c = quarks->At(j);
2096 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
2097 if (!ignore_case) return set_replacement(NULL);
2098 // Here, we need to check for characters whose upper and lower cases
2099 // are outside the Latin-1 range.
2100 uint16_t converted = ConvertNonLatin1ToLatin1(c);
2101 // Character is outside Latin-1 completely
2102 if (converted == 0) return set_replacement(NULL);
2103 // Convert quark to Latin-1 in place.
2104 (*quarks)[0] = converted;
2105 }
2106 } else {
2107 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2108 RegExpCharacterClass* cc = elm.char_class();
2109 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2110 if (!CharacterRange::IsCanonical(ranges)) {
2111 CharacterRange::Canonicalize(ranges);
2112 }
2113 // Now they are in order so we only need to look at the first.
2114 intptr_t range_count = ranges->length();
2115 if (cc->is_negated()) {
2116 if (range_count != 0 &&
2117 ranges->At(0).from() == 0 &&
2118 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) {
2119 // This will be handled in a later filter.
2120 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2121 return set_replacement(NULL);
2122 }
2123 } else {
2124 if (range_count == 0 ||
2125 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) {
2126 // This will be handled in a later filter.
2127 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2128 return set_replacement(NULL);
2129 }
2130 }
2131 }
2132 }
2133 return FilterSuccessor(depth - 1, ignore_case);
2134 }
2135
2136
2137 RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) {
2138 if (info()->replacement_calculated) return replacement();
2139 if (depth < 0) return this;
2140 if (info()->visited) return this;
2141 {
2142 VisitMarker marker(info());
2143
2144 RegExpNode* continue_replacement =
2145 continue_node_->FilterOneByte(depth - 1, ignore_case);
2146 // If we can't continue after the loop then there is no sense in doing the
2147 // loop.
2148 if (continue_replacement == NULL) return set_replacement(NULL);
2149 }
2150
2151 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2152 }
2153
2154
2155 RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) {
2156 if (info()->replacement_calculated) return replacement();
2157 if (depth < 0) return this;
2158 if (info()->visited) return this;
2159 VisitMarker marker(info());
2160 intptr_t choice_count = alternatives_->length();
2161
2162 for (intptr_t i = 0; i < choice_count; i++) {
2163 GuardedAlternative alternative = alternatives_->At(i);
2164 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2165 set_replacement(this);
2166 return this;
2167 }
2168 }
2169
2170 intptr_t surviving = 0;
2171 RegExpNode* survivor = NULL;
2172 for (intptr_t i = 0; i < choice_count; i++) {
2173 GuardedAlternative alternative = alternatives_->At(i);
2174 RegExpNode* replacement =
2175 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2176 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2177 if (replacement != NULL) {
2178 (*alternatives_)[i].set_node(replacement);
2179 surviving++;
2180 survivor = replacement;
2181 }
2182 }
2183 if (surviving < 2) return set_replacement(survivor);
2184
2185 set_replacement(this);
2186 if (surviving == choice_count) {
2187 return this;
2188 }
2189 // Only some of the nodes survived the filtering. We need to rebuild the
2190 // alternatives list.
2191 ZoneGrowableArray<GuardedAlternative>* new_alternatives =
2192 new(I) ZoneGrowableArray<GuardedAlternative>(surviving);
2193 for (intptr_t i = 0; i < choice_count; i++) {
2194 RegExpNode* replacement =
2195 (*alternatives_)[i].node()->FilterOneByte(depth - 1, ignore_case);
2196 if (replacement != NULL) {
2197 (*alternatives_)[i].set_node(replacement);
2198 new_alternatives->Add((*alternatives_)[i]);
2199 }
2200 }
2201 alternatives_ = new_alternatives;
2202 return this;
2203 }
2204
2205
2206 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(intptr_t depth,
2207 bool ignore_case) {
2208 if (info()->replacement_calculated) return replacement();
2209 if (depth < 0) return this;
2210 if (info()->visited) return this;
2211 VisitMarker marker(info());
2212 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2213 // afterwards.
2214 RegExpNode* node = (*alternatives_)[1].node();
2215 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2216 if (replacement == NULL) return set_replacement(NULL);
2217 (*alternatives_)[1].set_node(replacement);
2218
2219 RegExpNode* neg_node = (*alternatives_)[0].node();
2220 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2221 // If the negative lookahead is always going to fail then
2222 // we don't need to check it.
2223 if (neg_replacement == NULL) return set_replacement(replacement);
2224 (*alternatives_)[0].set_node(neg_replacement);
2225 return set_replacement(this);
2226 }
2227
2228
2229 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2230 RegExpCompiler* compiler,
2231 intptr_t characters_filled_in,
2232 bool not_at_start) {
2233 if (body_can_be_zero_length_ || info()->visited) return;
2234 VisitMarker marker(info());
2235 return ChoiceNode::GetQuickCheckDetails(details,
2236 compiler,
2237 characters_filled_in,
2238 not_at_start);
2239 }
2240
2241
2242 void LoopChoiceNode::FillInBMInfo(intptr_t offset,
2243 intptr_t budget,
2244 BoyerMooreLookahead* bm,
2245 bool not_at_start) {
2246 if (body_can_be_zero_length_ || budget <= 0) {
2247 bm->SetRest(offset);
2248 SaveBMInfo(bm, not_at_start, offset);
2249 return;
2250 }
2251 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2252 SaveBMInfo(bm, not_at_start, offset);
2253 }
2254
2255
2256 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2257 RegExpCompiler* compiler,
2258 intptr_t characters_filled_in,
2259 bool not_at_start) {
2260 not_at_start = (not_at_start || not_at_start_);
2261 intptr_t choice_count = alternatives_->length();
2262 ASSERT(choice_count > 0);
2263 (*alternatives_)[0].node()->GetQuickCheckDetails(details,
2264 compiler,
2265 characters_filled_in,
2266 not_at_start);
2267 for (intptr_t i = 1; i < choice_count; i++) {
2268 QuickCheckDetails new_details(details->characters());
2269 RegExpNode* node = (*alternatives_)[i].node();
2270 node->GetQuickCheckDetails(&new_details, compiler,
2271 characters_filled_in,
2272 not_at_start);
2273 // Here we merge the quick match details of the two branches.
2274 details->Merge(&new_details, characters_filled_in);
2275 }
2276 }
2277
2278
2279 // Check for [0-9A-Z_a-z].
2280 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2281 BlockLabel* word,
2282 BlockLabel* non_word,
2283 bool fall_through_on_word) {
2284 if (assembler->CheckSpecialCharacterClass(
2285 fall_through_on_word ? 'w' : 'W',
2286 fall_through_on_word ? non_word : word)) {
2287 // Optimized implementation available.
2288 return;
2289 }
2290 assembler->CheckCharacterGT('z', non_word);
2291 assembler->CheckCharacterLT('0', non_word);
2292 assembler->CheckCharacterGT('a' - 1, word);
2293 assembler->CheckCharacterLT('9' + 1, word);
2294 assembler->CheckCharacterLT('A', non_word);
2295 assembler->CheckCharacterLT('Z' + 1, word);
2296 if (fall_through_on_word) {
2297 assembler->CheckNotCharacter('_', non_word);
2298 } else {
2299 assembler->CheckCharacter('_', word);
2300 }
2301 }
2302
2303
2304 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2305 // that matches newline or the start of input).
2306 static void EmitHat(RegExpCompiler* compiler,
2307 RegExpNode* on_success,
2308 Trace* trace) {
2309 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2310 // We will be loading the previous character into the current character
2311 // register.
2312 Trace new_trace(*trace);
2313 new_trace.InvalidateCurrentCharacter();
2314
2315 BlockLabel ok;
2316 if (new_trace.cp_offset() == 0) {
2317 // The start of input counts as a newline in this context, so skip to
2318 // ok if we are at the start.
2319 assembler->CheckAtStart(&ok);
2320 }
2321 // We already checked that we are not at the start of input so it must be
2322 // OK to load the previous character.
2323 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2324 new_trace.backtrack(),
2325 false);
2326 if (!assembler->CheckSpecialCharacterClass('n',
2327 new_trace.backtrack())) {
2328 // Newline means \n, \r, 0x2028 or 0x2029.
2329 if (!compiler->one_byte()) {
2330 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2331 }
2332 assembler->CheckCharacter('\n', &ok);
2333 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2334 }
2335 assembler->BindBlock(&ok);
2336 on_success->Emit(compiler, &new_trace);
2337 }
2338
2339
2340 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2341 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2342 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2343 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2344 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2345 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2346 if (lookahead == NULL) {
2347 intptr_t eats_at_least =
2348 Utils::Minimum(kMaxLookaheadForBoyerMoore,
2349 EatsAtLeast(kMaxLookaheadForBoyerMoore,
2350 kRecursionBudget,
2351 not_at_start));
2352 if (eats_at_least >= 1) {
2353 BoyerMooreLookahead* bm =
2354 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);
2355 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
2356 if (bm->at(0)->is_non_word())
2357 next_is_word_character = Trace::FALSE_VALUE;
2358 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2359 }
2360 } else {
2361 if (lookahead->at(0)->is_non_word())
2362 next_is_word_character = Trace::FALSE_VALUE;
2363 if (lookahead->at(0)->is_word())
2364 next_is_word_character = Trace::TRUE_VALUE;
2365 }
2366 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2367 if (next_is_word_character == Trace::UNKNOWN) {
2368 BlockLabel before_non_word;
2369 BlockLabel before_word;
2370 if (trace->characters_preloaded() != 1) {
2371 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2372 }
2373 // Fall through on non-word.
2374 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2375 // Next character is not a word character.
2376 assembler->BindBlock(&before_non_word);
2377 BlockLabel ok;
2378 // Backtrack on \B (non-boundary check) if previous is a word,
2379 // since we know next *is not* a word and this would be a boundary.
2380 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2381
2382 if (!assembler->IsClosed()) {
2383 assembler->GoTo(&ok);
2384 }
2385
2386 assembler->BindBlock(&before_word);
2387 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2388 assembler->BindBlock(&ok);
2389 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2390 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2391 } else {
2392 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2393 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2394 }
2395 }
2396
2397
2398 void AssertionNode::BacktrackIfPrevious(
2399 RegExpCompiler* compiler,
2400 Trace* trace,
2401 AssertionNode::IfPrevious backtrack_if_previous) {
2402 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2403 Trace new_trace(*trace);
2404 new_trace.InvalidateCurrentCharacter();
2405
2406 BlockLabel fall_through, dummy;
2407
2408 BlockLabel* non_word = backtrack_if_previous == kIsNonWord ?
2409 new_trace.backtrack() :
2410 &fall_through;
2411 BlockLabel* word = backtrack_if_previous == kIsNonWord ?
2412 &fall_through :
2413 new_trace.backtrack();
2414
2415 if (new_trace.cp_offset() == 0) {
2416 // The start of input counts as a non-word character, so the question is
2417 // decided if we are at the start.
2418 assembler->CheckAtStart(non_word);
2419 }
2420 // We already checked that we are not at the start of input so it must be
2421 // OK to load the previous character.
2422 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2423 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2424
2425 assembler->BindBlock(&fall_through);
2426 on_success()->Emit(compiler, &new_trace);
2427 }
2428
2429
2430 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2431 RegExpCompiler* compiler,
2432 intptr_t filled_in,
2433 bool not_at_start) {
2434 if (assertion_type_ == AT_START && not_at_start) {
2435 details->set_cannot_match();
2436 return;
2437 }
2438 return on_success()->GetQuickCheckDetails(details,
2439 compiler,
2440 filled_in,
2441 not_at_start);
2442 }
2443
2444
2445 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2446 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2447 switch (assertion_type_) {
2448 case AT_END: {
2449 BlockLabel ok;
2450 assembler->CheckPosition(trace->cp_offset(), &ok);
2451 assembler->GoTo(trace->backtrack());
2452 assembler->BindBlock(&ok);
2453 break;
2454 }
2455 case AT_START: {
2456 if (trace->at_start() == Trace::FALSE_VALUE) {
2457 assembler->GoTo(trace->backtrack());
2458 return;
2459 }
2460 if (trace->at_start() == Trace::UNKNOWN) {
2461 assembler->CheckNotAtStart(trace->backtrack());
2462 Trace at_start_trace = *trace;
2463 at_start_trace.set_at_start(true);
2464 on_success()->Emit(compiler, &at_start_trace);
2465 return;
2466 }
2467 }
2468 break;
2469 case AFTER_NEWLINE:
2470 EmitHat(compiler, on_success(), trace);
2471 return;
2472 case AT_BOUNDARY:
2473 case AT_NON_BOUNDARY: {
2474 EmitBoundaryCheck(compiler, trace);
2475 return;
2476 }
2477 }
2478 on_success()->Emit(compiler, trace);
2479 }
2480
2481
2482 static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2483 if (quick_check == NULL) return false;
2484 if (offset >= quick_check->characters()) return false;
2485 return quick_check->positions(offset)->determines_perfectly;
2486 }
2487
2488
2489 static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2490 if (index > *checked_up_to) {
2491 *checked_up_to = index;
2492 }
2493 }
2494
2495
2496 // We call this repeatedly to generate code for each pass over the text node.
2497 // The passes are in increasing order of difficulty because we hope one
2498 // of the first passes will fail in which case we are saved the work of the
2499 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2500 // we will check the '%' in the first pass, the case independent 'a' in the
2501 // second pass and the character class in the last pass.
2502 //
2503 // The passes are done from right to left, so for example to test for /bar/
2504 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2505 // and then a 'b' with offset 0. This means we can avoid the end-of-input
2506 // bounds check most of the time. In the example we only need to check for
2507 // end-of-input when loading the putative 'r'.
2508 //
2509 // A slight complication involves the fact that the first character may already
2510 // be fetched into a register by the previous node. In this case we want to
2511 // do the test for that character first. We do this in separate passes. The
2512 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2513 // pass has been performed then subsequent passes will have true in
2514 // first_element_checked to indicate that that character does not need to be
2515 // checked again.
2516 //
2517 // In addition to all this we are passed a Trace, which can
2518 // contain an AlternativeGeneration object. In this AlternativeGeneration
2519 // object we can see details of any quick check that was already passed in
2520 // order to get to the code we are now generating. The quick check can involve
2521 // loading characters, which means we do not need to recheck the bounds
2522 // up to the limit the quick check already checked. In addition the quick
2523 // check can have involved a mask and compare operation which may simplify
2524 // or obviate the need for further checks at some character positions.
2525 void TextNode::TextEmitPass(RegExpCompiler* compiler,
2526 TextEmitPassType pass,
2527 bool preloaded,
2528 Trace* trace,
2529 bool first_element_checked,
2530 intptr_t* checked_up_to) {
2531 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2532 bool one_byte = compiler->one_byte();
2533 BlockLabel* backtrack = trace->backtrack();
2534 QuickCheckDetails* quick_check = trace->quick_check_performed();
2535 intptr_t element_count = elms_->length();
2536 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2537 TextElement elm = elms_->At(i);
2538 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset();
2539 if (elm.text_type() == TextElement::ATOM) {
2540 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2541 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2542 if (first_element_checked && i == 0 && j == 0) continue;
2543 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2544 EmitCharacterFunction* emit_function = NULL;
2545 switch (pass) {
2546 case NON_LATIN1_MATCH:
2547 ASSERT(one_byte);
2548 if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) {
2549 assembler->GoTo(backtrack);
2550 return;
2551 }
2552 break;
2553 case NON_LETTER_CHARACTER_MATCH:
2554 emit_function = &EmitAtomNonLetter;
2555 break;
2556 case SIMPLE_CHARACTER_MATCH:
2557 emit_function = &EmitSimpleCharacter;
2558 break;
2559 case CASE_CHARACTER_MATCH:
2560 emit_function = &EmitAtomLetter;
2561 break;
2562 default:
2563 break;
2564 }
2565 if (emit_function != NULL) {
2566 bool bound_checked = emit_function(I,
2567 compiler,
2568 quarks->At(j),
2569 backtrack,
2570 cp_offset + j,
2571 *checked_up_to < cp_offset + j,
2572 preloaded);
2573 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2574 }
2575 }
2576 } else {
2577 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2578 if (pass == CHARACTER_CLASS_MATCH) {
2579 if (first_element_checked && i == 0) continue;
2580 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2581 RegExpCharacterClass* cc = elm.char_class();
2582 EmitCharClass(assembler,
2583 cc,
2584 one_byte,
2585 backtrack,
2586 cp_offset,
2587 *checked_up_to < cp_offset,
2588 preloaded,
2589 I);
2590 UpdateBoundsCheck(cp_offset, checked_up_to);
2591 }
2592 }
2593 }
2594 }
2595
2596
2597 intptr_t TextNode::Length() {
2598 TextElement elm = elms_->Last();
2599 ASSERT(elm.cp_offset() >= 0);
2600 return elm.cp_offset() + elm.length();
2601 }
2602
2603
2604 bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
2605 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
2606 if (ignore_case) {
2607 return pass == SIMPLE_CHARACTER_MATCH;
2608 } else {
2609 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2610 }
2611 }
2612
2613
2614 // This generates the code to match a text node. A text node can contain
2615 // straight character sequences (possibly to be matched in a case-independent
2616 // way) and character classes. For efficiency we do not do this in a single
2617 // pass from left to right. Instead we pass over the text node several times,
2618 // emitting code for some character positions every time. See the comment on
2619 // TextEmitPass for details.
2620 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2621 LimitResult limit_result = LimitVersions(compiler, trace);
2622 if (limit_result == DONE) return;
2623 ASSERT(limit_result == CONTINUE);
2624
2625 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2626 compiler->SetRegExpTooBig();
2627 return;
2628 }
2629
2630 if (compiler->one_byte()) {
2631 intptr_t dummy = 0;
2632 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2633 }
2634
2635 bool first_elt_done = false;
2636 intptr_t bound_checked_to = trace->cp_offset() - 1;
2637 bound_checked_to += trace->bound_checked_up_to();
2638
2639 // If a character is preloaded into the current character register then
2640 // check that now.
2641 if (trace->characters_preloaded() == 1) {
2642 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2643 if (!SkipPass(pass, compiler->ignore_case())) {
2644 TextEmitPass(compiler,
2645 static_cast<TextEmitPassType>(pass),
2646 true,
2647 trace,
2648 false,
2649 &bound_checked_to);
2650 }
2651 }
2652 first_elt_done = true;
2653 }
2654
2655 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2656 if (!SkipPass(pass, compiler->ignore_case())) {
2657 TextEmitPass(compiler,
2658 static_cast<TextEmitPassType>(pass),
2659 false,
2660 trace,
2661 first_elt_done,
2662 &bound_checked_to);
2663 }
2664 }
2665
2666 Trace successor_trace(*trace);
2667 successor_trace.set_at_start(false);
2668 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2669 RecursionCheck rc(compiler);
2670 on_success()->Emit(compiler, &successor_trace);
2671 }
2672
2673
2674 void Trace::InvalidateCurrentCharacter() {
2675 characters_preloaded_ = 0;
2676 }
2677
2678
2679 void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
2680 RegExpCompiler* compiler) {
2681 ASSERT(by > 0);
2682 // We don't have an instruction for shifting the current character register
2683 // down or for using a shifted value for anything so lets just forget that
2684 // we preloaded any characters into it.
2685 characters_preloaded_ = 0;
2686 // Adjust the offsets of the quick check performed information. This
2687 // information is used to find out what we already determined about the
2688 // characters by means of mask and compare.
2689 quick_check_performed_.Advance(by, compiler->one_byte());
2690 cp_offset_ += by;
2691 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2692 compiler->SetRegExpTooBig();
2693 cp_offset_ = 0;
2694 }
2695 bound_checked_up_to_ = Utils::Maximum(static_cast<intptr_t>(0),
2696 bound_checked_up_to_ - by);
2697 }
2698
2699
2700 void TextNode::MakeCaseIndependent(bool is_one_byte) {
2701 intptr_t element_count = elms_->length();
2702 for (intptr_t i = 0; i < element_count; i++) {
2703 TextElement elm = elms_->At(i);
2704 if (elm.text_type() == TextElement::CHAR_CLASS) {
2705 RegExpCharacterClass* cc = elm.char_class();
2706 // None of the standard character classes is different in the case
2707 // independent case and it slows us down if we don't know that.
2708 if (cc->is_standard()) continue;
2709 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2710 intptr_t range_count = ranges->length();
2711 for (intptr_t j = 0; j < range_count; j++) {
2712 (*ranges)[j].AddCaseEquivalents(ranges, is_one_byte, I);
2713 }
2714 }
2715 }
2716 }
2717
2718
2719 intptr_t TextNode::GreedyLoopTextLength() {
2720 TextElement elm = elms_->At(elms_->length() - 1);
2721 return elm.cp_offset() + elm.length();
2722 }
2723
2724
2725 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2726 RegExpCompiler* compiler) {
2727 if (elms_->length() != 1) return NULL;
2728 TextElement elm = elms_->At(0);
2729 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
2730 RegExpCharacterClass* node = elm.char_class();
2731 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
2732 if (!CharacterRange::IsCanonical(ranges)) {
2733 CharacterRange::Canonicalize(ranges);
2734 }
2735 if (node->is_negated()) {
2736 return ranges->length() == 0 ? on_success() : NULL;
2737 }
2738 if (ranges->length() != 1) return NULL;
2739 uint32_t max_char;
2740 if (compiler->one_byte()) {
2741 max_char = Symbols::kMaxOneCharCodeSymbol;
2742 } else {
2743 max_char = Utf16::kMaxCodeUnit;
2744 }
2745 return ranges->At(0).IsEverything(max_char) ? on_success() : NULL;
2746 }
2747
2748
2749 // Finds the fixed match length of a sequence of nodes that goes from
2750 // this alternative and back to this choice node. If there are variable
2751 // length nodes or other complications in the way then return a sentinel
2752 // value indicating that a greedy loop cannot be constructed.
2753 intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(
2754 GuardedAlternative* alternative) {
2755 intptr_t length = 0;
2756 RegExpNode* node = alternative->node();
2757 // Later we will generate code for all these text nodes using recursion
2758 // so we have to limit the max number.
2759 intptr_t recursion_depth = 0;
2760 while (node != this) {
2761 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2762 return kNodeIsTooComplexForGreedyLoops;
2763 }
2764 intptr_t node_length = node->GreedyLoopTextLength();
2765 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2766 return kNodeIsTooComplexForGreedyLoops;
2767 }
2768 length += node_length;
2769 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2770 node = seq_node->on_success();
2771 }
2772 return length;
2773 }
2774
2775
2776 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2777 ASSERT(loop_node_ == NULL);
2778 AddAlternative(alt);
2779 loop_node_ = alt.node();
2780 }
2781
2782
2783 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2784 ASSERT(continue_node_ == NULL);
2785 AddAlternative(alt);
2786 continue_node_ = alt.node();
2787 }
2788
2789
2790 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2791 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2792 if (trace->stop_node() == this) {
2793 // Back edge of greedy optimized loop node graph.
2794 intptr_t text_length =
2795 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));
2796 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2797 // Update the counter-based backtracking info on the stack. This is an
2798 // optimization for greedy loops (see below).
2799 ASSERT(trace->cp_offset() == text_length);
2800 macro_assembler->AdvanceCurrentPosition(text_length);
2801 macro_assembler->GoTo(trace->loop_label());
2802 return;
2803 }
2804 ASSERT(trace->stop_node() == NULL);
2805 if (!trace->is_trivial()) {
2806 trace->Flush(compiler, this);
2807 return;
2808 }
2809 ChoiceNode::Emit(compiler, trace);
2810 }
2811
2812
2813 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2814 intptr_t eats_at_least) {
2815 intptr_t preload_characters = Utils::Minimum(static_cast<intptr_t>(4),
2816 eats_at_least);
2817 if (compiler->macro_assembler()->CanReadUnaligned()) {
2818 bool one_byte = compiler->one_byte();
2819 if (one_byte) {
2820 if (preload_characters > 4) preload_characters = 4;
2821 // We can't preload 3 characters because there is no machine instruction
2822 // to do that. We can't just load 4 because we could be reading
2823 // beyond the end of the string, which could cause a memory fault.
2824 if (preload_characters == 3) preload_characters = 2;
2825 } else {
2826 if (preload_characters > 2) preload_characters = 2;
2827 }
2828 } else {
2829 if (preload_characters > 1) preload_characters = 1;
2830 }
2831 return preload_characters;
2832 }
2833
2834
2835 // This structure is used when generating the alternatives in a choice node. It
2836 // records the way the alternative is being code generated.
2837 struct AlternativeGeneration {
2838 AlternativeGeneration()
2839 : possible_success(),
2840 expects_preload(false),
2841 after(),
2842 quick_check_details() { }
2843 BlockLabel possible_success;
2844 bool expects_preload;
2845 BlockLabel after;
2846 QuickCheckDetails quick_check_details;
2847 };
2848
2849
2850 // Creates a list of AlternativeGenerations. If the list has a reasonable
2851 // size then it is on the stack, otherwise the excess is on the heap.
2852 class AlternativeGenerationList {
2853 public:
2854 explicit AlternativeGenerationList(intptr_t count)
2855 : alt_gens_(count) {
2856 for (intptr_t i = 0; i < count && i < kAFew; i++) {
2857 alt_gens_.Add(a_few_alt_gens_ + i);
2858 }
2859 for (intptr_t i = kAFew; i < count; i++) {
2860 alt_gens_.Add(new AlternativeGeneration());
2861 }
2862 }
2863 ~AlternativeGenerationList() {
2864 for (intptr_t i = kAFew; i < alt_gens_.length(); i++) {
2865 delete alt_gens_[i];
2866 alt_gens_[i] = NULL;
2867 }
2868 }
2869
2870 AlternativeGeneration* at(intptr_t i) {
2871 return alt_gens_[i];
2872 }
2873
2874 private:
2875 static const intptr_t kAFew = 10;
2876 GrowableArray<AlternativeGeneration*> alt_gens_;
2877 AlternativeGeneration a_few_alt_gens_[kAFew];
2878
2879 DISALLOW_ALLOCATION();
2880 };
2881
2882
2883 // The '2' variant is inclusive from and exclusive to.
2884 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
2885 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
2886 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
2887 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
2888 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
2889 0xFEFF, 0xFF00, 0x10000 };
2890 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
2891 static const intptr_t kWordRanges[] = {
2892 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
2893 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
2894 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 };
2895 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
2896 static const intptr_t kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
2897 static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
2898 static const intptr_t kLineTerminatorRanges[] = {
2899 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 };
2900 static const intptr_t kLineTerminatorRangeCount =
2901 ARRAY_SIZE(kLineTerminatorRanges);
2902
2903
2904 void BoyerMoorePositionInfo::Set(intptr_t character) {
2905 SetInterval(Interval(character, character));
2906 }
2907
2908
2909 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2910 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2911 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2912 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2913 surrogate_ =
2914 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2915 if (interval.to() - interval.from() >= kMapSize - 1) {
2916 if (map_count_ != kMapSize) {
2917 map_count_ = kMapSize;
2918 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;
2919 }
2920 return;
2921 }
2922 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
2923 intptr_t mod_character = (i & kMask);
2924 if (!map_->At(mod_character)) {
2925 map_count_++;
2926 (*map_)[mod_character] = true;
2927 }
2928 if (map_count_ == kMapSize) return;
2929 }
2930 }
2931
2932
2933 void BoyerMoorePositionInfo::SetAll() {
2934 s_ = w_ = d_ = kLatticeUnknown;
2935 if (map_count_ != kMapSize) {
2936 map_count_ = kMapSize;
2937 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;
2938 }
2939 }
2940
2941
2942 BoyerMooreLookahead::BoyerMooreLookahead(
2943 intptr_t length, RegExpCompiler* compiler, Isolate* isolate)
2944 : length_(length),
2945 compiler_(compiler) {
2946 if (compiler->one_byte()) {
2947 max_char_ = Symbols::kMaxOneCharCodeSymbol;
2948 } else {
2949 max_char_ = Utf16::kMaxCodeUnit;
2950 }
2951 bitmaps_ = new(isolate) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);
2952 for (intptr_t i = 0; i < length; i++) {
2953 bitmaps_->Add(new(isolate) BoyerMoorePositionInfo(isolate));
2954 }
2955 }
2956
2957
2958 // Find the longest range of lookahead that has the fewest number of different
2959 // characters that can occur at a given position. Since we are optimizing two
2960 // different parameters at once this is a tradeoff.
2961 bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
2962 intptr_t biggest_points = 0;
2963 // If more than 32 characters out of 128 can occur it is unlikely that we can
2964 // be lucky enough to step forwards much of the time.
2965 const intptr_t kMaxMax = 32;
2966 for (intptr_t max_number_of_chars = 4;
2967 max_number_of_chars < kMaxMax;
2968 max_number_of_chars *= 2) {
2969 biggest_points =
2970 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2971 }
2972 if (biggest_points == 0) return false;
2973 return true;
2974 }
2975
2976
2977 // Find the highest-points range between 0 and length_ where the character
2978 // information is not too vague. 'Too vague' means that there are more than
2979 // max_number_of_chars that can occur at this position. Calculates the number
2980 // of points as the product of width-of-the-range and
2981 // probability-of-finding-one-of-the-characters, where the probability is
2982 // calculated using the frequency distribution of the sample subject string.
2983 intptr_t BoyerMooreLookahead::FindBestInterval(
2984 intptr_t max_number_of_chars,
2985 intptr_t old_biggest_points,
2986 intptr_t* from,
2987 intptr_t* to) {
2988 intptr_t biggest_points = old_biggest_points;
2989 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2990 for (intptr_t i = 0; i < length_; ) {
2991 while (i < length_ && Count(i) > max_number_of_chars) i++;
2992 if (i == length_) break;
2993 intptr_t remembered_from = i;
2994 bool union_map[kSize];
2995 for (intptr_t j = 0; j < kSize; j++) union_map[j] = false;
2996 while (i < length_ && Count(i) <= max_number_of_chars) {
2997 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2998 for (intptr_t j = 0; j < kSize; j++) union_map[j] |= map->at(j);
2999 i++;
3000 }
3001 intptr_t frequency = 0;
3002 for (intptr_t j = 0; j < kSize; j++) {
3003 if (union_map[j]) {
3004 // Add 1 to the frequency to give a small per-character boost for
3005 // the cases where our sampling is not good enough and many
3006 // characters have a frequency of zero. This means the frequency
3007 // can theoretically be up to 2*kSize though we treat it mostly as
3008 // a fraction of kSize.
3009 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3010 }
3011 }
3012 // We use the probability of skipping times the distance we are skipping to
3013 // judge the effectiveness of this. Actually we have a cut-off: By
3014 // dividing by 2 we switch off the skipping if the probability of skipping
3015 // is less than 50%. This is because the multibyte mask-and-compare
3016 // skipping in quickcheck is more likely to do well on this case.
3017 bool in_quickcheck_range = ((i - remembered_from < 4) ||
3018 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3019 // Called 'probability' but it is only a rough estimate and can actually
3020 // be outside the 0-kSize range.
3021 intptr_t probability =
3022 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3023 intptr_t points = (i - remembered_from) * probability;
3024 if (points > biggest_points) {
3025 *from = remembered_from;
3026 *to = i - 1;
3027 biggest_points = points;
3028 }
3029 }
3030 return biggest_points;
3031 }
3032
3033
3034 // Take all the characters that will not prevent a successful match if they
3035 // occur in the subject string in the range between min_lookahead and
3036 // max_lookahead (inclusive) measured from the current position. If the
3037 // character at max_lookahead offset is not one of these characters, then we
3038 // can safely skip forwards by the number of characters in the range.
3039 intptr_t BoyerMooreLookahead::GetSkipTable(
3040 intptr_t min_lookahead,
3041 intptr_t max_lookahead,
3042 const TypedData& boolean_skip_table) {
3043 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
3044
3045 const intptr_t kSkipArrayEntry = 0;
3046 const intptr_t kDontSkipArrayEntry = 1;
3047
3048 for (intptr_t i = 0; i < kSize; i++) {
3049 boolean_skip_table.SetUint8(i, kSkipArrayEntry);
3050 }
3051 intptr_t skip = max_lookahead + 1 - min_lookahead;
3052
3053 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
3054 BoyerMoorePositionInfo* map = bitmaps_->At(i);
3055 for (intptr_t j = 0; j < kSize; j++) {
3056 if (map->at(j)) {
3057 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);
3058 }
3059 }
3060 }
3061
3062 return skip;
3063 }
3064
3065
3066 // See comment above on the implementation of GetSkipTable.
3067 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3068 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
3069
3070 intptr_t min_lookahead = 0;
3071 intptr_t max_lookahead = 0;
3072
3073 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3074
3075 bool found_single_character = false;
3076 intptr_t single_character = 0;
3077 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
3078 BoyerMoorePositionInfo* map = bitmaps_->At(i);
3079 if (map->map_count() > 1 ||
3080 (found_single_character && map->map_count() != 0)) {
3081 found_single_character = false;
3082 break;
3083 }
3084 for (intptr_t j = 0; j < kSize; j++) {
3085 if (map->at(j)) {
3086 found_single_character = true;
3087 single_character = j;
3088 break;
3089 }
3090 }
3091 }
3092
3093 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
3094
3095 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3096 // The mask-compare can probably handle this better.
3097 return;
3098 }
3099
3100 if (found_single_character) {
3101 BlockLabel cont, again;
3102 masm->BindBlock(&again);
3103 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3104 if (max_char_ > kSize) {
3105 masm->CheckCharacterAfterAnd(single_character,
3106 RegExpMacroAssembler::kTableMask,
3107 &cont);
3108 } else {
3109 masm->CheckCharacter(single_character, &cont);
3110 }
3111 masm->AdvanceCurrentPosition(lookahead_width);
3112 masm->GoTo(&again);
3113 masm->BindBlock(&cont);
3114 return;
3115 }
3116
3117 const TypedData& boolean_skip_table = TypedData::ZoneHandle(
3118 compiler_->isolate(),
3119 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
3120 intptr_t skip_distance = GetSkipTable(
3121 min_lookahead, max_lookahead, boolean_skip_table);
3122 ASSERT(skip_distance != 0);
3123
3124 BlockLabel cont, again;
3125
3126 masm->BindBlock(&again);
3127 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3128 masm->CheckBitInTable(boolean_skip_table, &cont);
3129 masm->AdvanceCurrentPosition(skip_distance);
3130 masm->GoTo(&again);
3131 masm->BindBlock(&cont);
3132
3133 return;
3134 }
3135
3136
3137 /* Code generation for choice nodes.
3138 *
3139 * We generate quick checks that do a mask and compare to eliminate a
3140 * choice. If the quick check succeeds then it jumps to the continuation to
3141 * do slow checks and check subsequent nodes. If it fails (the common case)
3142 * it falls through to the next choice.
3143 *
3144 * Here is the desired flow graph. Nodes directly below each other imply
3145 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3146 * 3 doesn't have a quick check so we have to call the slow check.
3147 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3148 * regexp continuation is generated directly after the Sn node, up to the
3149 * next GoTo if we decide to reuse some already generated code. Some
3150 * nodes expect preload_characters to be preloaded into the current
3151 * character register. R nodes do this preloading. Vertices are marked
3152 * F for failures and S for success (possible success in the case of quick
3153 * nodes). L, V, < and > are used as arrow heads.
3154 *
3155 * ----------> R
3156 * |
3157 * V
3158 * Q1 -----> S1
3159 * | S /
3160 * F| /
3161 * | F/
3162 * | /
3163 * | R
3164 * | /
3165 * V L
3166 * Q2 -----> S2
3167 * | S /
3168 * F| /
3169 * | F/
3170 * | /
3171 * | R
3172 * | /
3173 * V L
3174 * S3
3175 * |
3176 * F|
3177 * |
3178 * R
3179 * |
3180 * backtrack V
3181 * <----------Q4
3182 * \ F |
3183 * \ |S
3184 * \ F V
3185 * \-----S4
3186 *
3187 * For greedy loops we push the current position, then generate the code that
3188 * eats the input specially in EmitGreedyLoop. The other choice (the
3189 * continuation) is generated by the normal code in EmitChoices, and steps back
3190 * in the input to the starting position when it fails to match. The loop code
3191 * looks like this (U is the unwind code that steps back in the greedy loop).
3192 *
3193 * _____
3194 * / \
3195 * V |
3196 * ----------> S1 |
3197 * /| |
3198 * / |S |
3199 * F/ \_____/
3200 * /
3201 * |<-----
3202 * | \
3203 * V |S
3204 * Q2 ---> U----->backtrack
3205 * | F /
3206 * S| /
3207 * V F /
3208 * S2--/
3209 */
3210
3211 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3212 counter_backtrack_trace_.set_backtrack(&label_);
3213 if (not_at_start) counter_backtrack_trace_.set_at_start(false);
3214 }
3215
3216
3217 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3218 #ifdef DEBUG
3219 intptr_t choice_count = alternatives_->length();
3220 for (intptr_t i = 0; i < choice_count - 1; i++) {
3221 GuardedAlternative alternative = alternatives_->At(i);
3222 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3223 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
3224 for (intptr_t j = 0; j < guard_count; j++) {
3225 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
3226 }
3227 }
3228 #endif
3229 }
3230
3231
3232 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3233 Trace* current_trace,
3234 PreloadState* state) {
3235 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3236 // Save some time by looking at most one machine word ahead.
3237 state->eats_at_least_ =
3238 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3239 current_trace->at_start() == Trace::FALSE_VALUE);
3240 }
3241 state->preload_characters_ =
3242 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3243
3244 state->preload_is_current_ =
3245 (current_trace->characters_preloaded() == state->preload_characters_);
3246 state->preload_has_checked_bounds_ = state->preload_is_current_;
3247 }
3248
3249
3250 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3251 intptr_t choice_count = alternatives_->length();
3252
3253 AssertGuardsMentionRegisters(trace);
3254
3255 LimitResult limit_result = LimitVersions(compiler, trace);
3256 if (limit_result == DONE) return;
3257 ASSERT(limit_result == CONTINUE);
3258
3259 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3260 // other choice nodes we only flush if we are out of code size budget.
3261 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3262 trace->Flush(compiler, this);
3263 return;
3264 }
3265
3266 RecursionCheck rc(compiler);
3267
3268 PreloadState preload;
3269 preload.init();
3270 GreedyLoopState greedy_loop_state(not_at_start());
3271
3272 intptr_t text_length =
3273 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));
3274 AlternativeGenerationList alt_gens(choice_count);
3275
3276 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3277 trace = EmitGreedyLoop(compiler,
3278 trace,
3279 &alt_gens,
3280 &preload,
3281 &greedy_loop_state,
3282 text_length);
3283 } else {
3284 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3285 // match the traces produced pre-cleanup.
3286 BlockLabel second_choice;
3287 compiler->macro_assembler()->BindBlock(&second_choice);
3288
3289 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3290
3291 EmitChoices(compiler,
3292 &alt_gens,
3293 0,
3294 trace,
3295 &preload);
3296 }
3297
3298 // At this point we need to generate slow checks for the alternatives where
3299 // the quick check was inlined. We can recognize these because the associated
3300 // label was bound.
3301 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3302 for (intptr_t i = 0; i < choice_count; i++) {
3303 AlternativeGeneration* alt_gen = alt_gens.at(i);
3304 Trace new_trace(*trace);
3305 // If there are actions to be flushed we have to limit how many times
3306 // they are flushed. Take the budget of the parent trace and distribute
3307 // it fairly amongst the children.
3308 if (new_trace.actions() != NULL) {
3309 new_trace.set_flush_budget(new_flush_budget);
3310 }
3311 bool next_expects_preload =
3312 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3313 EmitOutOfLineContinuation(compiler,
3314 &new_trace,
3315 alternatives_->At(i),
3316 alt_gen,
3317 preload.preload_characters_,
3318 next_expects_preload);
3319 }
3320 }
3321
3322 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3323 Trace* trace,
3324 AlternativeGenerationList* alt_gens,
3325 PreloadState* preload,
3326 GreedyLoopState* greedy_loop_state,
3327 intptr_t text_length) {
3328 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3329 // Here we have special handling for greedy loops containing only text nodes
3330 // and other simple nodes. These are handled by pushing the current
3331 // position on the stack and then incrementing the current position each
3332 // time around the switch. On backtrack we decrement the current position
3333 // and check it against the pushed value. This avoids pushing backtrack
3334 // information for each iteration of the loop, which could take up a lot of
3335 // space.
3336 ASSERT(trace->stop_node() == NULL);
3337 macro_assembler->PushCurrentPosition();
3338 BlockLabel greedy_match_failed;
3339 Trace greedy_match_trace;
3340 if (not_at_start()) greedy_match_trace.set_at_start(false);
3341 greedy_match_trace.set_backtrack(&greedy_match_failed);
3342 BlockLabel loop_label;
3343 macro_assembler->BindBlock(&loop_label);
3344 greedy_match_trace.set_stop_node(this);
3345 greedy_match_trace.set_loop_label(&loop_label);
3346 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);
3347 macro_assembler->BindBlock(&greedy_match_failed);
3348
3349 BlockLabel second_choice; // For use in greedy matches.
3350 macro_assembler->BindBlock(&second_choice);
3351
3352 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3353
3354 EmitChoices(compiler,
3355 alt_gens,
3356 1,
3357 new_trace,
3358 preload);
3359
3360 macro_assembler->BindBlock(greedy_loop_state->label());
3361 // If we have unwound to the bottom then backtrack.
3362 macro_assembler->CheckGreedyLoop(trace->backtrack());
3363 // Otherwise try the second priority at an earlier position.
3364 macro_assembler->AdvanceCurrentPosition(-text_length);
3365 macro_assembler->GoTo(&second_choice);
3366 return new_trace;
3367 }
3368
3369
3370 intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3371 Trace* trace) {
3372 intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3373 if (alternatives_->length() != 2) return eats_at_least;
3374
3375 GuardedAlternative alt1 = alternatives_->At(1);
3376 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
3377 return eats_at_least;
3378 }
3379 RegExpNode* eats_anything_node = alt1.node();
3380 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3381 return eats_at_least;
3382 }
3383
3384 // Really we should be creating a new trace when we execute this function,
3385 // but there is no need, because the code it generates cannot backtrack, and
3386 // we always arrive here with a trivial trace (since it's the entry to a
3387 // loop. That also implies that there are no preloaded characters, which is
3388 // good, because it means we won't be violating any assumptions by
3389 // overwriting those characters with new load instructions.
3390 ASSERT(trace->is_trivial());
3391
3392 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3393 // At this point we know that we are at a non-greedy loop that will eat
3394 // any character one at a time. Any non-anchored regexp has such a
3395 // loop prepended to it in order to find where it starts. We look for
3396 // a pattern of the form ...abc... where we can look 6 characters ahead
3397 // and step forwards 3 if the character is not one of abc. Abc need
3398 // not be atoms, they can be any reasonably limited character class or
3399 // small alternation.
3400 BoyerMooreLookahead* bm = bm_info(false);
3401 if (bm == NULL) {
3402 eats_at_least = Utils::Minimum(kMaxLookaheadForBoyerMoore,
3403 EatsAtLeast(kMaxLookaheadForBoyerMoore,
3404 kRecursionBudget,
3405 false));
3406 if (eats_at_least >= 1) {
3407 bm = new(I) BoyerMooreLookahead(eats_at_least, compiler, I);
3408 GuardedAlternative alt0 = alternatives_->At(0);
3409 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
3410 }
3411 }
3412 if (bm != NULL) {
3413 bm->EmitSkipInstructions(macro_assembler);
3414 }
3415 return eats_at_least;
3416 }
3417
3418
3419 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3420 AlternativeGenerationList* alt_gens,
3421 intptr_t first_choice,
3422 Trace* trace,
3423 PreloadState* preload) {
3424 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3425 SetUpPreLoad(compiler, trace, preload);
3426
3427 // For now we just call all choices one after the other. The idea ultimately
3428 // is to use the Dispatch table to try only the relevant ones.
3429 intptr_t choice_count = alternatives_->length();
3430
3431 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3432
3433 for (intptr_t i = first_choice; i < choice_count; i++) {
3434 bool is_last = i == choice_count - 1;
3435 bool fall_through_on_failure = !is_last;
3436 GuardedAlternative alternative = alternatives_->At(i);
3437 AlternativeGeneration* alt_gen = alt_gens->at(i);
3438 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3439 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3440 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
3441 Trace new_trace(*trace);
3442 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
3443 preload->preload_characters_ :
3444 0);
3445 if (preload->preload_has_checked_bounds_) {
3446 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3447 }
3448 new_trace.quick_check_performed()->Clear();
3449 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3450 if (!is_last) {
3451 new_trace.set_backtrack(&alt_gen->after);
3452 }
3453 alt_gen->expects_preload = preload->preload_is_current_;
3454 bool generate_full_check_inline = false;
3455 if (kRegexpOptimization &&
3456 try_to_emit_quick_check_for_alternative(i == 0) &&
3457 alternative.node()->EmitQuickCheck(compiler,
3458 trace,
3459 &new_trace,
3460 preload->preload_has_checked_bounds_,
3461 &alt_gen->possible_success,
3462 &alt_gen->quick_check_details,
3463 fall_through_on_failure)) {
3464 // Quick check was generated for this choice.
3465 preload->preload_is_current_ = true;
3466 preload->preload_has_checked_bounds_ = true;
3467 // If we generated the quick check to fall through on possible success,
3468 // we now need to generate the full check inline.
3469 if (!fall_through_on_failure) {
3470 macro_assembler->BindBlock(&alt_gen->possible_success);
3471 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3472 new_trace.set_characters_preloaded(preload->preload_characters_);
3473 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3474 generate_full_check_inline = true;
3475 }
3476 } else if (alt_gen->quick_check_details.cannot_match()) {
3477 if (!fall_through_on_failure) {
3478 macro_assembler->GoTo(trace->backtrack());
3479 }
3480 continue;
3481 } else {
3482 // No quick check was generated. Put the full code here.
3483 // If this is not the first choice then there could be slow checks from
3484 // previous cases that go here when they fail. There's no reason to
3485 // insist that they preload characters since the slow check we are about
3486 // to generate probably can't use it.
3487 if (i != first_choice) {
3488 alt_gen->expects_preload = false;
3489 new_trace.InvalidateCurrentCharacter();
3490 }
3491 generate_full_check_inline = true;
3492 }
3493 if (generate_full_check_inline) {
3494 if (new_trace.actions() != NULL) {
3495 new_trace.set_flush_budget(new_flush_budget);
3496 }
3497 for (intptr_t j = 0; j < guard_count; j++) {
3498 GenerateGuard(macro_assembler, guards->At(j), &new_trace);
3499 }
3500 alternative.node()->Emit(compiler, &new_trace);
3501 preload->preload_is_current_ = false;
3502 }
3503 macro_assembler->BindBlock(&alt_gen->after);
3504 }
3505 }
3506
3507
3508 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3509 Trace* trace,
3510 GuardedAlternative alternative,
3511 AlternativeGeneration* alt_gen,
3512 intptr_t preload_characters,
3513 bool next_expects_preload) {
3514 if (!alt_gen->possible_success.IsLinked()) return;
3515
3516 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3517 macro_assembler->BindBlock(&alt_gen->possible_success);
3518 Trace out_of_line_trace(*trace);
3519 out_of_line_trace.set_characters_preloaded(preload_characters);
3520 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3521 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3522 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3523 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
3524 if (next_expects_preload) {
3525 BlockLabel reload_current_char;
3526 out_of_line_trace.set_backtrack(&reload_current_char);
3527 for (intptr_t j = 0; j < guard_count; j++) {
3528 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3529 }
3530 alternative.node()->Emit(compiler, &out_of_line_trace);
3531 macro_assembler->BindBlock(&reload_current_char);
3532 // Reload the current character, since the next quick check expects that.
3533 // We don't need to check bounds here because we only get into this
3534 // code through a quick check which already did the checked load.
3535 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
3536 NULL,
3537 false,
3538 preload_characters);
3539 macro_assembler->GoTo(&(alt_gen->after));
3540 } else {
3541 out_of_line_trace.set_backtrack(&(alt_gen->after));
3542 for (intptr_t j = 0; j < guard_count; j++) {
3543 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3544 }
3545 alternative.node()->Emit(compiler, &out_of_line_trace);
3546 }
3547 }
3548
3549
3550 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3551 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3552 LimitResult limit_result = LimitVersions(compiler, trace);
3553 if (limit_result == DONE) return;
3554 ASSERT(limit_result == CONTINUE);
3555
3556 RecursionCheck rc(compiler);
3557
3558 switch (action_type_) {
3559 case STORE_POSITION: {
3560 Trace::DeferredCapture
3561 new_capture(data_.u_position_register.reg,
3562 data_.u_position_register.is_capture,
3563 trace);
3564 Trace new_trace = *trace;
3565 new_trace.add_action(&new_capture);
3566 on_success()->Emit(compiler, &new_trace);
3567 break;
3568 }
3569 case INCREMENT_REGISTER: {
3570 Trace::DeferredIncrementRegister
3571 new_increment(data_.u_increment_register.reg);
3572 Trace new_trace = *trace;
3573 new_trace.add_action(&new_increment);
3574 on_success()->Emit(compiler, &new_trace);
3575 break;
3576 }
3577 case SET_REGISTER: {
3578 Trace::DeferredSetRegister
3579 new_set(data_.u_store_register.reg, data_.u_store_register.value);
3580 Trace new_trace = *trace;
3581 new_trace.add_action(&new_set);
3582 on_success()->Emit(compiler, &new_trace);
3583 break;
3584 }
3585 case CLEAR_CAPTURES: {
3586 Trace::DeferredClearCaptures
3587 new_capture(Interval(data_.u_clear_captures.range_from,
3588 data_.u_clear_captures.range_to));
3589 Trace new_trace = *trace;
3590 new_trace.add_action(&new_capture);
3591 on_success()->Emit(compiler, &new_trace);
3592 break;
3593 }
3594 case BEGIN_SUBMATCH:
3595 if (!trace->is_trivial()) {
3596 trace->Flush(compiler, this);
3597 } else {
3598 assembler->WriteCurrentPositionToRegister(
3599 data_.u_submatch.current_position_register, 0);
3600 assembler->WriteStackPointerToRegister(
3601 data_.u_submatch.stack_pointer_register);
3602 on_success()->Emit(compiler, trace);
3603 }
3604 break;
3605 case EMPTY_MATCH_CHECK: {
3606 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
3607 intptr_t stored_pos = 0;
3608 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
3609 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3610 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3611 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3612 // If we know we haven't advanced and there is no minimum we
3613 // can just backtrack immediately.
3614 assembler->GoTo(trace->backtrack());
3615 } else if (know_dist && stored_pos < trace->cp_offset()) {
3616 // If we know we've advanced we can generate the continuation
3617 // immediately.
3618 on_success()->Emit(compiler, trace);
3619 } else if (!trace->is_trivial()) {
3620 trace->Flush(compiler, this);
3621 } else {
3622 BlockLabel skip_empty_check;
3623 // If we have a minimum number of repetitions we check the current
3624 // number first and skip the empty check if it's not enough.
3625 if (has_minimum) {
3626 intptr_t limit = data_.u_empty_match_check.repetition_limit;
3627 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3628 }
3629 // If the match is empty we bail out, otherwise we fall through
3630 // to the on-success continuation.
3631 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3632 trace->backtrack());
3633 assembler->BindBlock(&skip_empty_check);
3634 on_success()->Emit(compiler, trace);
3635 }
3636 break;
3637 }
3638 case POSITIVE_SUBMATCH_SUCCESS: {
3639 if (!trace->is_trivial()) {
3640 trace->Flush(compiler, this);
3641 return;
3642 }
3643 assembler->ReadCurrentPositionFromRegister(
3644 data_.u_submatch.current_position_register);
3645 assembler->ReadStackPointerFromRegister(
3646 data_.u_submatch.stack_pointer_register);
3647 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
3648 if (clear_register_count == 0) {
3649 on_success()->Emit(compiler, trace);
3650 return;
3651 }
3652 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
3653 BlockLabel clear_registers_backtrack;
3654 Trace new_trace = *trace;
3655 new_trace.set_backtrack(&clear_registers_backtrack);
3656 on_success()->Emit(compiler, &new_trace);
3657
3658 assembler->BindBlock(&clear_registers_backtrack);
3659 intptr_t clear_registers_to =
3660 clear_registers_from + clear_register_count - 1;
3661 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3662
3663 ASSERT(trace->backtrack() == NULL);
3664 assembler->Backtrack();
3665 return;
3666 }
3667 default:
3668 UNREACHABLE();
3669 }
3670 }
3671
3672
3673 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3674 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3675 if (!trace->is_trivial()) {
3676 trace->Flush(compiler, this);
3677 return;
3678 }
3679
3680 LimitResult limit_result = LimitVersions(compiler, trace);
3681 if (limit_result == DONE) return;
3682 ASSERT(limit_result == CONTINUE);
3683
3684 RecursionCheck rc(compiler);
3685
3686 ASSERT(start_reg_ + 1 == end_reg_);
3687 if (compiler->ignore_case()) {
3688 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3689 trace->backtrack());
3690 } else {
3691 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
3692 }
3693 on_success()->Emit(compiler, trace);
3694 }
3695
3696
3697 // -------------------------------------------------------------------
3698 // Dot/dotty output
3699
3700
3701 #ifdef DEBUG
3702
3703
3704 class DotPrinter: public NodeVisitor {
3705 public:
3706 explicit DotPrinter(bool ignore_case)
3707 : ignore_case_(ignore_case) {}
3708 void PrintNode(const char* label, RegExpNode* node);
3709 void Visit(RegExpNode* node);
3710 void PrintAttributes(RegExpNode* from);
3711 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3712 #define DECLARE_VISIT(Type) \
3713 virtual void Visit##Type(Type##Node* that);
3714 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3715 #undef DECLARE_VISIT
3716 private:
3717 bool ignore_case_;
3718 };
3719
3720
3721 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3722 OS::Print("digraph G {\n graph [label=\"");
3723 for (intptr_t i = 0; label[i]; i++) {
3724 switch (label[i]) {
3725 case '\\':
3726 OS::Print("\\\\");
3727 break;
3728 case '"':
3729 OS::Print("\"");
3730 break;
3731 default:
3732 OS::Print("%c", label[i]);
3733 break;
3734 }
3735 }
3736 OS::Print("\"];\n");
3737 Visit(node);
3738 OS::Print("}\n");
3739 }
3740
3741
3742 void DotPrinter::Visit(RegExpNode* node) {
3743 if (node->info()->visited) return;
3744 node->info()->visited = true;
3745 node->Accept(this);
3746 }
3747
3748
3749 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3750 OS::Print(" n%p -> n%p [style=dotted];\n", from, on_failure);
3751 Visit(on_failure);
3752 }
3753
3754
3755 class AttributePrinter : public ValueObject {
3756 public:
3757 AttributePrinter() : first_(true) {}
3758 void PrintSeparator() {
3759 if (first_) {
3760 first_ = false;
3761 } else {
3762 OS::Print("|");
3763 }
3764 }
3765 void PrintBit(const char* name, bool value) {
3766 if (!value) return;
3767 PrintSeparator();
3768 OS::Print("{%s}", name);
3769 }
3770 void PrintPositive(const char* name, intptr_t value) {
3771 if (value < 0) return;
3772 PrintSeparator();
3773 OS::Print("{%s|%" Pd "}", name, value);
3774 }
3775
3776 private:
3777 bool first_;
3778 };
3779
3780
3781 void DotPrinter::PrintAttributes(RegExpNode* that) {
3782 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3783 "margin=0.1, fontsize=10, label=\"{", that);
3784 AttributePrinter printer;
3785 NodeInfo* info = that->info();
3786 printer.PrintBit("NI", info->follows_newline_interest);
3787 printer.PrintBit("WI", info->follows_word_interest);
3788 printer.PrintBit("SI", info->follows_start_interest);
3789 BlockLabel* label = that->label();
3790 if (label->IsBound())
3791 printer.PrintPositive("@", label->Position());
3792 OS::Print("}\"];\n"
3793 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
3794 that, that);
3795 }
3796
3797
3798 void DotPrinter::VisitChoice(ChoiceNode* that) {
3799 OS::Print(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3800 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3801 GuardedAlternative alt = that->alternatives()->At(i);
3802 OS::Print(" n%p -> n%p", that, alt.node());
3803 }
3804 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3805 GuardedAlternative alt = that->alternatives()->At(i);
3806 alt.node()->Accept(this);
3807 }
3808 }
3809
3810
3811 void DotPrinter::VisitText(TextNode* that) {
3812 OS::Print(" n%p [label=\"", that);
3813 for (intptr_t i = 0; i < that->elements()->length(); i++) {
3814 if (i > 0) OS::Print(" ");
3815 TextElement elm = that->elements()->At(i);
3816 switch (elm.text_type()) {
3817 case TextElement::ATOM: {
3818 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
3819 for (intptr_t i = 0; i < data->length(); i++) {
3820 OS::Print("%c", static_cast<char>(data->At(i)));
3821 }
3822 break;
3823 }
3824 case TextElement::CHAR_CLASS: {
3825 RegExpCharacterClass* node = elm.char_class();
3826 OS::Print("[");
3827 if (node->is_negated()) OS::Print("^");
3828 for (intptr_t j = 0; j < node->ranges()->length(); j++) {
3829 CharacterRange range = node->ranges()->At(j);
3830 PrintUtf16(range.from());
3831 OS::Print("-");
3832 PrintUtf16(range.to());
3833 }
3834 OS::Print("]");
3835 break;
3836 }
3837 default:
3838 UNREACHABLE();
3839 }
3840 }
3841 OS::Print("\", shape=box, peripheries=2];\n");
3842 PrintAttributes(that);
3843 OS::Print(" n%p -> n%p;\n", that, that->on_success());
3844 Visit(that->on_success());
3845 }
3846
3847
3848 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3849 OS::Print(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
3850 that, that->start_register(), that->end_register());
3851 PrintAttributes(that);
3852 OS::Print(" n%p -> n%p;\n", that, that->on_success());
3853 Visit(that->on_success());
3854 }
3855
3856
3857 void DotPrinter::VisitEnd(EndNode* that) {
3858 OS::Print(" n%p [style=bold, shape=point];\n", that);
3859 PrintAttributes(that);
3860 }
3861
3862
3863 void DotPrinter::VisitAssertion(AssertionNode* that) {
3864 OS::Print(" n%p [", that);
3865 switch (that->assertion_type()) {
3866 case AssertionNode::AT_END:
3867 OS::Print("label=\"$\", shape=septagon");
3868 break;
3869 case AssertionNode::AT_START:
3870 OS::Print("label=\"^\", shape=septagon");
3871 break;
3872 case AssertionNode::AT_BOUNDARY:
3873 OS::Print("label=\"\\b\", shape=septagon");
3874 break;
3875 case AssertionNode::AT_NON_BOUNDARY:
3876 OS::Print("label=\"\\B\", shape=septagon");
3877 break;
3878 case AssertionNode::AFTER_NEWLINE:
3879 OS::Print("label=\"(?<=\\n)\", shape=septagon");
3880 break;
3881 }
3882 OS::Print("];\n");
3883 PrintAttributes(that);
3884 RegExpNode* successor = that->on_success();
3885 OS::Print(" n%p -> n%p;\n", that, successor);
3886 Visit(successor);
3887 }
3888
3889
3890 void DotPrinter::VisitAction(ActionNode* that) {
3891 OS::Print(" n%p [", that);
3892 switch (that->action_type_) {
3893 case ActionNode::SET_REGISTER:
3894 OS::Print("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
3895 that->data_.u_store_register.reg,
3896 that->data_.u_store_register.value);
3897 break;
3898 case ActionNode::INCREMENT_REGISTER:
3899 OS::Print("label=\"$%" Pd "++\", shape=octagon",
3900 that->data_.u_increment_register.reg);
3901 break;
3902 case ActionNode::STORE_POSITION:
3903 OS::Print("label=\"$%" Pd ":=$pos\", shape=octagon",
3904 that->data_.u_position_register.reg);
3905 break;
3906 case ActionNode::BEGIN_SUBMATCH:
3907 OS::Print("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
3908 that->data_.u_submatch.current_position_register);
3909 break;
3910 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3911 OS::Print("label=\"escape\", shape=septagon");
3912 break;
3913 case ActionNode::EMPTY_MATCH_CHECK:
3914 OS::Print("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
3915 that->data_.u_empty_match_check.start_register,
3916 that->data_.u_empty_match_check.repetition_register,
3917 that->data_.u_empty_match_check.repetition_limit);
3918 break;
3919 case ActionNode::CLEAR_CAPTURES: {
3920 OS::Print("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
3921 that->data_.u_clear_captures.range_from,
3922 that->data_.u_clear_captures.range_to);
3923 break;
3924 }
3925 }
3926 OS::Print("];\n");
3927 PrintAttributes(that);
3928 RegExpNode* successor = that->on_success();
3929 OS::Print(" n%p -> n%p;\n", that, successor);
3930 Visit(successor);
3931 }
3932
3933
3934 void RegExpEngine::DotPrint(const char* label,
3935 RegExpNode* node,
3936 bool ignore_case) {
3937 DotPrinter printer(ignore_case);
3938 printer.PrintNode(label, node);
3939 }
3940
3941
3942 #endif // DEBUG
3943
3944
3945 // -------------------------------------------------------------------
3946 // Tree to graph conversion
3947
3948 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3949 RegExpNode* on_success) {
3950 ZoneGrowableArray<TextElement>* elms =
3951 new(CI) ZoneGrowableArray<TextElement>(1);
3952 elms->Add(TextElement::Atom(this));
3953 return new(CI) TextNode(elms, on_success);
3954 }
3955
3956
3957 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3958 RegExpNode* on_success) {
3959 ZoneGrowableArray<TextElement>* elms =
3960 new(CI) ZoneGrowableArray<TextElement>(1);
3961 for (intptr_t i = 0; i < elements()->length(); i++) {
3962 elms->Add(elements()->At(i));
3963 }
3964 return new(CI) TextNode(elms, on_success);
3965 }
3966
3967
3968 static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,
3969 const intptr_t* special_class,
3970 intptr_t length) {
3971 length--; // Remove final 0x10000.
3972 ASSERT(special_class[length] == 0x10000);
3973 ASSERT(ranges->length() != 0);
3974 ASSERT(length != 0);
3975 ASSERT(special_class[0] != 0);
3976 if (ranges->length() != (length >> 1) + 1) {
3977 return false;
3978 }
3979 CharacterRange range = ranges->At(0);
3980 if (range.from() != 0) {
3981 return false;
3982 }
3983 for (intptr_t i = 0; i < length; i += 2) {
3984 if (special_class[i] != (range.to() + 1)) {
3985 return false;
3986 }
3987 range = ranges->At((i >> 1) + 1);
3988 if (special_class[i+1] != range.from()) {
3989 return false;
3990 }
3991 }
3992 if (range.to() != 0xffff) {
3993 return false;
3994 }
3995 return true;
3996 }
3997
3998
3999 static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges,
4000 const intptr_t* special_class,
4001 intptr_t length) {
4002 length--; // Remove final 0x10000.
4003 ASSERT(special_class[length] == 0x10000);
4004 if (ranges->length() * 2 != length) {
4005 return false;
4006 }
4007 for (intptr_t i = 0; i < length; i += 2) {
4008 CharacterRange range = ranges->At(i >> 1);
4009 if (range.from() != special_class[i] ||
4010 range.to() != special_class[i + 1] - 1) {
4011 return false;
4012 }
4013 }
4014 return true;
4015 }
4016
4017
4018 bool RegExpCharacterClass::is_standard() {
4019 // TODO(lrn): Remove need for this function, by not throwing away information
4020 // along the way.
4021 if (is_negated_) {
4022 return false;
4023 }
4024 if (set_.is_standard()) {
4025 return true;
4026 }
4027 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
4028 set_.set_standard_set_type('s');
4029 return true;
4030 }
4031 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
4032 set_.set_standard_set_type('S');
4033 return true;
4034 }
4035 if (CompareInverseRanges(set_.ranges(),
4036 kLineTerminatorRanges,
4037 kLineTerminatorRangeCount)) {
4038 set_.set_standard_set_type('.');
4039 return true;
4040 }
4041 if (CompareRanges(set_.ranges(),
4042 kLineTerminatorRanges,
4043 kLineTerminatorRangeCount)) {
4044 set_.set_standard_set_type('n');
4045 return true;
4046 }
4047 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
4048 set_.set_standard_set_type('w');
4049 return true;
4050 }
4051 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
4052 set_.set_standard_set_type('W');
4053 return true;
4054 }
4055 return false;
4056 }
4057
4058
4059 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4060 RegExpNode* on_success) {
4061 return new(CI) TextNode(this, on_success);
4062 }
4063
4064
4065 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4066 RegExpNode* on_success) {
4067 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
4068 intptr_t length = alternatives->length();
4069 ChoiceNode* result =
4070 new(CI) ChoiceNode(length, CI);
4071 for (intptr_t i = 0; i < length; i++) {
4072 GuardedAlternative alternative(alternatives->At(i)->ToNode(compiler,
4073 on_success));
4074 result->AddAlternative(alternative);
4075 }
4076 return result;
4077 }
4078
4079
4080 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4081 RegExpNode* on_success) {
4082 return ToNode(min(),
4083 max(),
4084 is_greedy(),
4085 body(),
4086 compiler,
4087 on_success);
4088 }
4089
4090
4091 // Scoped object to keep track of how much we unroll quantifier loops in the
4092 // regexp graph generator.
4093 class RegExpExpansionLimiter : public ValueObject {
4094 public:
4095 static const intptr_t kMaxExpansionFactor = 6;
4096 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)
4097 : compiler_(compiler),
4098 saved_expansion_factor_(compiler->current_expansion_factor()),
4099 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4100 ASSERT(factor > 0);
4101 if (ok_to_expand_) {
4102 if (factor > kMaxExpansionFactor) {
4103 // Avoid integer overflow of the current expansion factor.
4104 ok_to_expand_ = false;
4105 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4106 } else {
4107 intptr_t new_factor = saved_expansion_factor_ * factor;
4108 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4109 compiler->set_current_expansion_factor(new_factor);
4110 }
4111 }
4112 }
4113
4114 ~RegExpExpansionLimiter() {
4115 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4116 }
4117
4118 bool ok_to_expand() { return ok_to_expand_; }
4119
4120 private:
4121 RegExpCompiler* compiler_;
4122 intptr_t saved_expansion_factor_;
4123 bool ok_to_expand_;
4124
4125 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4126 };
4127
4128
4129 RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
4130 intptr_t max,
4131 bool is_greedy,
4132 RegExpTree* body,
4133 RegExpCompiler* compiler,
4134 RegExpNode* on_success,
4135 bool not_at_start) {
4136 // x{f, t} becomes this:
4137 //
4138 // (r++)<-.
4139 // | `
4140 // | (x)
4141 // v ^
4142 // (r=0)-->(?)---/ [if r < t]
4143 // |
4144 // [if r >= f] \----> ...
4145 //
4146
4147 // 15.10.2.5 RepeatMatcher algorithm.
4148 // The parser has already eliminated the case where max is 0. In the case
4149 // where max_match is zero the parser has removed the quantifier if min was
4150 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4151
4152 // If we know that we cannot match zero length then things are a little
4153 // simpler since we don't need to make the special zero length match check
4154 // from step 2.1. If the min and max are small we can unroll a little in
4155 // this case.
4156 // Unroll (foo)+ and (foo){3,}
4157 static const intptr_t kMaxUnrolledMinMatches = 3;
4158 // Unroll (foo)? and (foo){x,3}
4159 static const intptr_t kMaxUnrolledMaxMatches = 3;
4160 if (max == 0) return on_success; // This can happen due to recursion.
4161 bool body_can_be_empty = (body->min_match() == 0);
4162 intptr_t body_start_reg = RegExpCompiler::kNoRegister;
4163 Interval capture_registers = body->CaptureRegisters();
4164 bool needs_capture_clearing = !capture_registers.is_empty();
4165 Isolate* isolate = compiler->isolate();
4166
4167 if (body_can_be_empty) {
4168 body_start_reg = compiler->AllocateRegister();
4169 } else if (kRegexpOptimization && !needs_capture_clearing) {
4170 // Only unroll if there are no captures and the body can't be
4171 // empty.
4172 {
4173 RegExpExpansionLimiter limiter(
4174 compiler, min + ((max != min) ? 1 : 0));
4175 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4176 intptr_t new_max = (max == kInfinity) ? max : max - min;
4177 // Recurse once to get the loop or optional matches after the fixed
4178 // ones.
4179 RegExpNode* answer = ToNode(
4180 0, new_max, is_greedy, body, compiler, on_success, true);
4181 // Unroll the forced matches from 0 to min. This can cause chains of
4182 // TextNodes (which the parser does not generate). These should be
4183 // combined if it turns out they hinder good code generation.
4184 for (intptr_t i = 0; i < min; i++) {
4185 answer = body->ToNode(compiler, answer);
4186 }
4187 return answer;
4188 }
4189 }
4190 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4191 ASSERT(max > 0); // Due to the 'if' above.
4192 RegExpExpansionLimiter limiter(compiler, max);
4193 if (limiter.ok_to_expand()) {
4194 // Unroll the optional matches up to max.
4195 RegExpNode* answer = on_success;
4196 for (intptr_t i = 0; i < max; i++) {
4197 ChoiceNode* alternation = new(isolate) ChoiceNode(2, isolate);
4198 if (is_greedy) {
4199 alternation->AddAlternative(
4200 GuardedAlternative(body->ToNode(compiler, answer)));
4201 alternation->AddAlternative(GuardedAlternative(on_success));
4202 } else {
4203 alternation->AddAlternative(GuardedAlternative(on_success));
4204 alternation->AddAlternative(
4205 GuardedAlternative(body->ToNode(compiler, answer)));
4206 }
4207 answer = alternation;
4208 if (not_at_start) alternation->set_not_at_start();
4209 }
4210 return answer;
4211 }
4212 }
4213 }
4214 bool has_min = min > 0;
4215 bool has_max = max < RegExpTree::kInfinity;
4216 bool needs_counter = has_min || has_max;
4217 intptr_t reg_ctr = needs_counter
4218 ? compiler->AllocateRegister()
4219 : RegExpCompiler::kNoRegister;
4220 LoopChoiceNode* center = new(isolate) LoopChoiceNode(body->min_match() == 0,
4221 isolate);
4222 if (not_at_start) center->set_not_at_start();
4223 RegExpNode* loop_return = needs_counter
4224 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4225 : static_cast<RegExpNode*>(center);
4226 if (body_can_be_empty) {
4227 // If the body can be empty we need to check if it was and then
4228 // backtrack.
4229 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4230 reg_ctr,
4231 min,
4232 loop_return);
4233 }
4234 RegExpNode* body_node = body->ToNode(compiler, loop_return);
4235 if (body_can_be_empty) {
4236 // If the body can be empty we need to store the start position
4237 // so we can bail out if it was empty.
4238 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4239 }
4240 if (needs_capture_clearing) {
4241 // Before entering the body of this loop we need to clear captures.
4242 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4243 }
4244 GuardedAlternative body_alt(body_node);
4245 if (has_max) {
4246 Guard* body_guard =
4247 new(isolate) Guard(reg_ctr, Guard::LT, max);
4248 body_alt.AddGuard(body_guard, isolate);
4249 }
4250 GuardedAlternative rest_alt(on_success);
4251 if (has_min) {
4252 Guard* rest_guard = new(isolate) Guard(reg_ctr, Guard::GEQ, min);
4253 rest_alt.AddGuard(rest_guard, isolate);
4254 }
4255 if (is_greedy) {
4256 center->AddLoopAlternative(body_alt);
4257 center->AddContinueAlternative(rest_alt);
4258 } else {
4259 center->AddContinueAlternative(rest_alt);
4260 center->AddLoopAlternative(body_alt);
4261 }
4262 if (needs_counter) {
4263 return ActionNode::SetRegister(reg_ctr, 0, center);
4264 } else {
4265 return center;
4266 }
4267 }
4268
4269
4270 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
4271 RegExpNode* on_success) {
4272 switch (assertion_type()) {
4273 case START_OF_LINE:
4274 return AssertionNode::AfterNewline(on_success);
4275 case START_OF_INPUT:
4276 return AssertionNode::AtStart(on_success);
4277 case BOUNDARY:
4278 return AssertionNode::AtBoundary(on_success);
4279 case NON_BOUNDARY:
4280 return AssertionNode::AtNonBoundary(on_success);
4281 case END_OF_INPUT:
4282 return AssertionNode::AtEnd(on_success);
4283 case END_OF_LINE: {
4284 // Compile $ in multiline regexps as an alternation with a positive
4285 // lookahead in one side and an end-of-input on the other side.
4286 // We need two registers for the lookahead.
4287 intptr_t stack_pointer_register = compiler->AllocateRegister();
4288 intptr_t position_register = compiler->AllocateRegister();
4289 // The ChoiceNode to distinguish between a newline and end-of-input.
4290 ChoiceNode* result = new ChoiceNode(2, on_success->isolate());
4291 // Create a newline atom.
4292 ZoneGrowableArray<CharacterRange>* newline_ranges =
4293 new ZoneGrowableArray<CharacterRange>(3);
4294 CharacterRange::AddClassEscape('n', newline_ranges);
4295 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
4296 TextNode* newline_matcher = new TextNode(
4297 newline_atom,
4298 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4299 position_register,
4300 0, // No captures inside.
4301 -1, // Ignored if no captures.
4302 on_success));
4303 // Create an end-of-input matcher.
4304 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4305 stack_pointer_register,
4306 position_register,
4307 newline_matcher);
4308 // Add the two alternatives to the ChoiceNode.
4309 GuardedAlternative eol_alternative(end_of_line);
4310 result->AddAlternative(eol_alternative);
4311 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4312 result->AddAlternative(end_alternative);
4313 return result;
4314 }
4315 default:
4316 UNREACHABLE();
4317 }
4318 return on_success;
4319 }
4320
4321
4322 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
4323 RegExpNode* on_success) {
4324 return new(CI)
4325 BackReferenceNode(RegExpCapture::StartRegister(index()),
4326 RegExpCapture::EndRegister(index()),
4327 on_success);
4328 }
4329
4330
4331 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
4332 RegExpNode* on_success) {
4333 return on_success;
4334 }
4335
4336
4337 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
4338 RegExpNode* on_success) {
4339 intptr_t stack_pointer_register = compiler->AllocateRegister();
4340 intptr_t position_register = compiler->AllocateRegister();
4341
4342 const intptr_t registers_per_capture = 2;
4343 const intptr_t register_of_first_capture = 2;
4344 intptr_t register_count = capture_count_ * registers_per_capture;
4345 intptr_t register_start =
4346 register_of_first_capture + capture_from_ * registers_per_capture;
4347
4348 RegExpNode* success;
4349 if (is_positive()) {
4350 RegExpNode* node = ActionNode::BeginSubmatch(
4351 stack_pointer_register,
4352 position_register,
4353 body()->ToNode(
4354 compiler,
4355 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4356 position_register,
4357 register_count,
4358 register_start,
4359 on_success)));
4360 return node;
4361 } else {
4362 // We use a ChoiceNode for a negative lookahead because it has most of
4363 // the characteristics we need. It has the body of the lookahead as its
4364 // first alternative and the expression after the lookahead of the second
4365 // alternative. If the first alternative succeeds then the
4366 // NegativeSubmatchSuccess will unwind the stack including everything the
4367 // choice node set up and backtrack. If the first alternative fails then
4368 // the second alternative is tried, which is exactly the desired result
4369 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
4370 // ChoiceNode that knows to ignore the first exit when calculating quick
4371 // checks.
4372
4373 GuardedAlternative body_alt(
4374 body()->ToNode(
4375 compiler,
4376 success = new(CI) NegativeSubmatchSuccess(stack_pointer_register,
4377 position_register,
4378 register_count,
4379 register_start,
4380 CI)));
4381 ChoiceNode* choice_node =
4382 new(CI) NegativeLookaheadChoiceNode(body_alt,
4383 GuardedAlternative(on_success),
4384 CI);
4385 return ActionNode::BeginSubmatch(stack_pointer_register,
4386 position_register,
4387 choice_node);
4388 }
4389 }
4390
4391
4392 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
4393 RegExpNode* on_success) {
4394 return ToNode(body(), index(), compiler, on_success);
4395 }
4396
4397
4398 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4399 intptr_t index,
4400 RegExpCompiler* compiler,
4401 RegExpNode* on_success) {
4402 intptr_t start_reg = RegExpCapture::StartRegister(index);
4403 intptr_t end_reg = RegExpCapture::EndRegister(index);
4404 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
4405 RegExpNode* body_node = body->ToNode(compiler, store_end);
4406 return ActionNode::StorePosition(start_reg, true, body_node);
4407 }
4408
4409
4410 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
4411 RegExpNode* on_success) {
4412 ZoneGrowableArray<RegExpTree*>* children = nodes();
4413 RegExpNode* current = on_success;
4414 for (intptr_t i = children->length() - 1; i >= 0; i--) {
4415 current = children->At(i)->ToNode(compiler, current);
4416 }
4417 return current;
4418 }
4419
4420
4421 static void AddClass(const intptr_t* elmv,
4422 intptr_t elmc,
4423 ZoneGrowableArray<CharacterRange>* ranges) {
4424 elmc--;
4425 ASSERT(elmv[elmc] == 0x10000);
4426 for (intptr_t i = 0; i < elmc; i += 2) {
4427 ASSERT(elmv[i] < elmv[i + 1]);
4428 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
4429 }
4430 }
4431
4432
4433 static void AddClassNegated(const intptr_t *elmv,
4434 intptr_t elmc,
4435 ZoneGrowableArray<CharacterRange>* ranges) {
4436 elmc--;
4437 ASSERT(elmv[elmc] == 0x10000);
4438 ASSERT(elmv[0] != 0x0000);
4439 ASSERT(elmv[elmc-1] != Utf16::kMaxCodeUnit);
4440 uint16_t last = 0x0000;
4441 for (intptr_t i = 0; i < elmc; i += 2) {
4442 ASSERT(last <= elmv[i] - 1);
4443 ASSERT(elmv[i] < elmv[i + 1]);
4444 ranges->Add(CharacterRange(last, elmv[i] - 1));
4445 last = elmv[i + 1];
4446 }
4447 ranges->Add(CharacterRange(last, Utf16::kMaxCodeUnit));
4448 }
4449
4450
4451 void CharacterRange::AddClassEscape(uint16_t type,
4452 ZoneGrowableArray<CharacterRange>* ranges) {
4453 switch (type) {
4454 case 's':
4455 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4456 break;
4457 case 'S':
4458 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4459 break;
4460 case 'w':
4461 AddClass(kWordRanges, kWordRangeCount, ranges);
4462 break;
4463 case 'W':
4464 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4465 break;
4466 case 'd':
4467 AddClass(kDigitRanges, kDigitRangeCount, ranges);
4468 break;
4469 case 'D':
4470 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4471 break;
4472 case '.':
4473 AddClassNegated(kLineTerminatorRanges,
4474 kLineTerminatorRangeCount,
4475 ranges);
4476 break;
4477 // This is not a character range as defined by the spec but a
4478 // convenient shorthand for a character class that matches any
4479 // character.
4480 case '*':
4481 ranges->Add(CharacterRange::Everything());
4482 break;
4483 // This is the set of characters matched by the $ and ^ symbols
4484 // in multiline mode.
4485 case 'n':
4486 AddClass(kLineTerminatorRanges,
4487 kLineTerminatorRangeCount,
4488 ranges);
4489 break;
4490 default:
4491 UNREACHABLE();
4492 }
4493 }
4494
4495
4496 void CharacterRange::AddCaseEquivalents(
4497 ZoneGrowableArray<CharacterRange>* ranges,
4498 bool is_one_byte,
4499 Isolate* isolate) {
4500 uint16_t bottom = from();
4501 uint16_t top = to();
4502 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
4503 if (bottom > Symbols::kMaxOneCharCodeSymbol) return;
4504 if (top > Symbols::kMaxOneCharCodeSymbol) {
4505 top = Symbols::kMaxOneCharCodeSymbol;
4506 }
4507 }
4508
4509 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
4510 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;
4511 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4512 if (top == bottom) {
4513 // If this is a singleton we just expand the one character.
4514 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLIN T
4515 for (intptr_t i = 0; i < length; i++) {
4516 uint32_t chr = chars[i];
4517 if (chr != bottom) {
4518 ranges->Add(CharacterRange::Singleton(chars[i]));
4519 }
4520 }
4521 } else {
4522 // If this is a range we expand the characters block by block,
4523 // expanding contiguous subranges (blocks) one at a time.
4524 // The approach is as follows. For a given start character we
4525 // look up the remainder of the block that contains it (represented
4526 // by the end point), for instance we find 'z' if the character
4527 // is 'c'. A block is characterized by the property
4528 // that all characters uncanonicalize in the same way, except that
4529 // each entry in the result is incremented by the distance from the first
4530 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
4531 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4532 // Once we've found the end point we look up its uncanonicalization
4533 // and produce a range for each element. For instance for [c-f]
4534 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4535 // add a range if it is not already contained in the input, so [c-f]
4536 // will be skipped but [C-F] will be added. If this range is not
4537 // completely contained in a block we do this for all the blocks
4538 // covered by the range (handling characters that is not in a block
4539 // as a "singleton block").
4540 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4541 intptr_t pos = bottom;
4542 while (pos <= top) {
4543 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);
4544 uint16_t block_end;
4545 if (length == 0) {
4546 block_end = pos;
4547 } else {
4548 ASSERT(length == 1);
4549 block_end = range[0];
4550 }
4551 intptr_t end = (block_end > top) ? top : block_end;
4552 length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT
4553 for (intptr_t i = 0; i < length; i++) {
4554 uint32_t c = range[i];
4555 uint16_t range_from = c - (block_end - pos);
4556 uint16_t range_to = c - (block_end - end);
4557 if (!(bottom <= range_from && range_to <= top)) {
4558 ranges->Add(CharacterRange(range_from, range_to));
4559 }
4560 }
4561 pos = end + 1;
4562 }
4563 }
4564 }
4565
4566
4567 bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) {
4568 ASSERT(ranges != NULL);
4569 intptr_t n = ranges->length();
4570 if (n <= 1) return true;
4571 intptr_t max = ranges->At(0).to();
4572 for (intptr_t i = 1; i < n; i++) {
4573 CharacterRange next_range = ranges->At(i);
4574 if (next_range.from() <= max + 1) return false;
4575 max = next_range.to();
4576 }
4577 return true;
4578 }
4579
4580
4581 ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() {
4582 if (ranges_ == NULL) {
4583 ranges_ = new ZoneGrowableArray<CharacterRange>(2);
4584 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4585 }
4586 return ranges_;
4587 }
4588
4589
4590 // Move a number of elements in a zone array to another position
4591 // in the same array. Handles overlapping source and target areas.
4592 static void MoveRanges(ZoneGrowableArray<CharacterRange>* list,
4593 intptr_t from,
4594 intptr_t to,
4595 intptr_t count) {
4596 // Ranges are potentially overlapping.
4597 if (from < to) {
4598 for (intptr_t i = count - 1; i >= 0; i--) {
4599 (*list)[to + i] = list->At(from + i);
4600 }
4601 } else {
4602 for (intptr_t i = 0; i < count; i++) {
4603 (*list)[to + i] = list->At(from + i);
4604 }
4605 }
4606 }
4607
4608
4609 static intptr_t InsertRangeInCanonicalList(
4610 ZoneGrowableArray<CharacterRange>* list,
4611 intptr_t count,
4612 CharacterRange insert) {
4613 // Inserts a range into list[0..count[, which must be sorted
4614 // by from value and non-overlapping and non-adjacent, using at most
4615 // list[0..count] for the result. Returns the number of resulting
4616 // canonicalized ranges. Inserting a range may collapse existing ranges into
4617 // fewer ranges, so the return value can be anything in the range 1..count+1.
4618 uint16_t from = insert.from();
4619 uint16_t to = insert.to();
4620 intptr_t start_pos = 0;
4621 intptr_t end_pos = count;
4622 for (intptr_t i = count - 1; i >= 0; i--) {
4623 CharacterRange current = list->At(i);
4624 if (current.from() > to + 1) {
4625 end_pos = i;
4626 } else if (current.to() + 1 < from) {
4627 start_pos = i + 1;
4628 break;
4629 }
4630 }
4631
4632 // Inserted range overlaps, or is adjacent to, ranges at positions
4633 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4634 // not affected by the insertion.
4635 // If start_pos == end_pos, the range must be inserted before start_pos.
4636 // if start_pos < end_pos, the entire range from start_pos to end_pos
4637 // must be merged with the insert range.
4638
4639 if (start_pos == end_pos) {
4640 // Insert between existing ranges at position start_pos.
4641 if (start_pos < count) {
4642 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4643 }
4644 (*list)[start_pos] = insert;
4645 return count + 1;
4646 }
4647 if (start_pos + 1 == end_pos) {
4648 // Replace single existing range at position start_pos.
4649 CharacterRange to_replace = list->At(start_pos);
4650 intptr_t new_from = Utils::Minimum(to_replace.from(), from);
4651 intptr_t new_to = Utils::Maximum(to_replace.to(), to);
4652 (*list)[start_pos] = CharacterRange(new_from, new_to);
4653 return count;
4654 }
4655 // Replace a number of existing ranges from start_pos to end_pos - 1.
4656 // Move the remaining ranges down.
4657
4658 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);
4659 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to);
4660 if (end_pos < count) {
4661 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4662 }
4663 (*list)[start_pos] = CharacterRange(new_from, new_to);
4664 return count - (end_pos - start_pos) + 1;
4665 }
4666
4667
4668 void CharacterSet::Canonicalize() {
4669 // Special/default classes are always considered canonical. The result
4670 // of calling ranges() will be sorted.
4671 if (ranges_ == NULL) return;
4672 CharacterRange::Canonicalize(ranges_);
4673 }
4674
4675
4676 void CharacterRange::Canonicalize(
4677 ZoneGrowableArray<CharacterRange>* character_ranges) {
4678 if (character_ranges->length() <= 1) return;
4679 // Check whether ranges are already canonical (increasing, non-overlapping,
4680 // non-adjacent).
4681 intptr_t n = character_ranges->length();
4682 intptr_t max = character_ranges->At(0).to();
4683 intptr_t i = 1;
4684 while (i < n) {
4685 CharacterRange current = character_ranges->At(i);
4686 if (current.from() <= max + 1) {
4687 break;
4688 }
4689 max = current.to();
4690 i++;
4691 }
4692 // Canonical until the i'th range. If that's all of them, we are done.
4693 if (i == n) return;
4694
4695 // The ranges at index i and forward are not canonicalized. Make them so by
4696 // doing the equivalent of insertion sort (inserting each into the previous
4697 // list, in order).
4698 // Notice that inserting a range can reduce the number of ranges in the
4699 // result due to combining of adjacent and overlapping ranges.
4700 intptr_t read = i; // Range to insert.
4701 intptr_t num_canonical = i; // Length of canonicalized part of list.
4702 do {
4703 num_canonical = InsertRangeInCanonicalList(character_ranges,
4704 num_canonical,
4705 character_ranges->At(read));
4706 read++;
4707 } while (read < n);
4708 character_ranges->TruncateTo(num_canonical);
4709
4710 ASSERT(CharacterRange::IsCanonical(character_ranges));
4711 }
4712
4713
4714 void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges,
4715 ZoneGrowableArray<CharacterRange>* negated_ranges) {
4716 ASSERT(CharacterRange::IsCanonical(ranges));
4717 ASSERT(negated_ranges->length() == 0);
4718 intptr_t range_count = ranges->length();
4719 uint16_t from = 0;
4720 intptr_t i = 0;
4721 if (range_count > 0 && ranges->At(0).from() == 0) {
4722 from = ranges->At(0).to();
4723 i = 1;
4724 }
4725 while (i < range_count) {
4726 CharacterRange range = ranges->At(i);
4727 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4728 from = range.to();
4729 i++;
4730 }
4731 if (from < Utf16::kMaxCodeUnit) {
4732 negated_ranges->Add(CharacterRange(from + 1, Utf16::kMaxCodeUnit));
4733 }
4734 }
4735
4736
4737 // -------------------------------------------------------------------
4738 // Splay tree
4739
4740
4741 // Workaround for the fact that ZoneGrowableArray does not have contains().
4742 static bool ArrayContains(ZoneGrowableArray<unsigned>* array,
4743 unsigned value) {
4744 for (intptr_t i = 0; i < array->length(); i++) {
4745 if (array->At(i) == value) {
4746 return true;
4747 }
4748 }
4749 return false;
4750 }
4751
4752
4753 void OutSet::Set(unsigned value, Isolate* isolate) {
4754 if (value < kFirstLimit) {
4755 first_ |= (1 << value);
4756 } else {
4757 if (remaining_ == NULL)
4758 remaining_ = new(isolate) ZoneGrowableArray<unsigned>(1);
4759
4760 bool remaining_contains_value = ArrayContains(remaining_, value);
4761 if (remaining_->is_empty() || !remaining_contains_value) {
4762 remaining_->Add(value);
4763 }
4764 }
4765 }
4766
4767
4768 bool OutSet::Get(unsigned value) const {
4769 if (value < kFirstLimit) {
4770 return (first_ & (1 << value)) != 0;
4771 } else if (remaining_ == NULL) {
4772 return false;
4773 } else {
4774 return ArrayContains(remaining_, value);
4775 }
4776 }
4777
4778
4779 // -------------------------------------------------------------------
4780 // Analysis
4781
4782
4783 void Analysis::EnsureAnalyzed(RegExpNode* that) {
4784 if (that->info()->been_analyzed || that->info()->being_analyzed)
4785 return;
4786 that->info()->being_analyzed = true;
4787 that->Accept(this);
4788 that->info()->being_analyzed = false;
4789 that->info()->been_analyzed = true;
4790 }
4791
4792
4793 void Analysis::VisitEnd(EndNode* that) {
4794 // nothing to do
4795 }
4796
4797
4798 void TextNode::CalculateOffsets() {
4799 intptr_t element_count = elements()->length();
4800 // Set up the offsets of the elements relative to the start. This is a fixed
4801 // quantity since a TextNode can only contain fixed-width things.
4802 intptr_t cp_offset = 0;
4803 for (intptr_t i = 0; i < element_count; i++) {
4804 TextElement& elm = (*elements())[i];
4805 elm.set_cp_offset(cp_offset);
4806 cp_offset += elm.length();
4807 }
4808 }
4809
4810
4811 void Analysis::VisitText(TextNode* that) {
4812 if (ignore_case_) {
4813 that->MakeCaseIndependent(is_one_byte_);
4814 }
4815 EnsureAnalyzed(that->on_success());
4816 if (!has_failed()) {
4817 that->CalculateOffsets();
4818 }
4819 }
4820
4821
4822 void Analysis::VisitAction(ActionNode* that) {
4823 RegExpNode* target = that->on_success();
4824 EnsureAnalyzed(target);
4825 if (!has_failed()) {
4826 // If the next node is interested in what it follows then this node
4827 // has to be interested too so it can pass the information on.
4828 that->info()->AddFromFollowing(target->info());
4829 }
4830 }
4831
4832
4833 void Analysis::VisitChoice(ChoiceNode* that) {
4834 NodeInfo* info = that->info();
4835 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
4836 RegExpNode* node = (*that->alternatives())[i].node();
4837 EnsureAnalyzed(node);
4838 if (has_failed()) return;
4839 // Anything the following nodes need to know has to be known by
4840 // this node also, so it can pass it on.
4841 info->AddFromFollowing(node->info());
4842 }
4843 }
4844
4845
4846 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4847 NodeInfo* info = that->info();
4848 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
4849 RegExpNode* node = (*that->alternatives())[i].node();
4850 if (node != that->loop_node()) {
4851 EnsureAnalyzed(node);
4852 if (has_failed()) return;
4853 info->AddFromFollowing(node->info());
4854 }
4855 }
4856 // Check the loop last since it may need the value of this node
4857 // to get a correct result.
4858 EnsureAnalyzed(that->loop_node());
4859 if (!has_failed()) {
4860 info->AddFromFollowing(that->loop_node()->info());
4861 }
4862 }
4863
4864
4865 void Analysis::VisitBackReference(BackReferenceNode* that) {
4866 EnsureAnalyzed(that->on_success());
4867 }
4868
4869
4870 void Analysis::VisitAssertion(AssertionNode* that) {
4871 EnsureAnalyzed(that->on_success());
4872 }
4873
4874
4875 void BackReferenceNode::FillInBMInfo(intptr_t offset,
4876 intptr_t budget,
4877 BoyerMooreLookahead* bm,
4878 bool not_at_start) {
4879 // Working out the set of characters that a backreference can match is too
4880 // hard, so we just say that any character can match.
4881 bm->SetRest(offset);
4882 SaveBMInfo(bm, not_at_start, offset);
4883 }
4884
4885
4886 COMPILE_ASSERT(BoyerMoorePositionInfo::kMapSize ==
4887 RegExpMacroAssembler::kTableSize);
4888
4889
4890 void ChoiceNode::FillInBMInfo(intptr_t offset,
4891 intptr_t budget,
4892 BoyerMooreLookahead* bm,
4893 bool not_at_start) {
4894 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
4895 budget = (budget - 1) / alts->length();
4896 for (intptr_t i = 0; i < alts->length(); i++) {
4897 GuardedAlternative& alt = (*alts)[i];
4898 if (alt.guards() != NULL && alt.guards()->length() != 0) {
4899 bm->SetRest(offset); // Give up trying to fill in info.
4900 SaveBMInfo(bm, not_at_start, offset);
4901 return;
4902 }
4903 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
4904 }
4905 SaveBMInfo(bm, not_at_start, offset);
4906 }
4907
4908
4909 void TextNode::FillInBMInfo(intptr_t initial_offset,
4910 intptr_t budget,
4911 BoyerMooreLookahead* bm,
4912 bool not_at_start) {
4913 if (initial_offset >= bm->length()) return;
4914 intptr_t offset = initial_offset;
4915 intptr_t max_char = bm->max_char();
4916 for (intptr_t i = 0; i < elements()->length(); i++) {
4917 if (offset >= bm->length()) {
4918 if (initial_offset == 0) set_bm_info(not_at_start, bm);
4919 return;
4920 }
4921 TextElement text = elements()->At(i);
4922 if (text.text_type() == TextElement::ATOM) {
4923 RegExpAtom* atom = text.atom();
4924 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
4925 if (offset >= bm->length()) {
4926 if (initial_offset == 0) set_bm_info(not_at_start, bm);
4927 return;
4928 }
4929 uint16_t character = atom->data()->At(j);
4930 if (bm->compiler()->ignore_case()) {
4931 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4932 intptr_t length = GetCaseIndependentLetters(
4933 character,
4934 bm->max_char() == Symbols::kMaxOneCharCodeSymbol,
4935 chars);
4936 for (intptr_t j = 0; j < length; j++) {
4937 bm->Set(offset, chars[j]);
4938 }
4939 } else {
4940 if (character <= max_char) bm->Set(offset, character);
4941 }
4942 }
4943 } else {
4944 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
4945 RegExpCharacterClass* char_class = text.char_class();
4946 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
4947 if (char_class->is_negated()) {
4948 bm->SetAll(offset);
4949 } else {
4950 for (intptr_t k = 0; k < ranges->length(); k++) {
4951 CharacterRange& range = (*ranges)[k];
4952 if (range.from() > max_char) continue;
4953 intptr_t to = Utils::Minimum(max_char,
4954 static_cast<intptr_t>(range.to()));
4955 bm->SetInterval(offset, Interval(range.from(), to));
4956 }
4957 }
4958 offset++;
4959 }
4960 }
4961 if (offset >= bm->length()) {
4962 if (initial_offset == 0) set_bm_info(not_at_start, bm);
4963 return;
4964 }
4965 on_success()->FillInBMInfo(offset,
4966 budget - 1,
4967 bm,
4968 true); // Not at start after a text node.
4969 if (initial_offset == 0) set_bm_info(not_at_start, bm);
4970 }
4971
4972
4973 RegExpEngine::CompilationResult RegExpEngine::Compile(
4974 RegExpCompileData* data,
4975 const ParsedFunction* parsed_function,
4976 const ZoneGrowableArray<const ICData*>& ic_data_array) {
4977 Isolate* isolate = Isolate::Current();
4978
4979 const Function& function = parsed_function->function();
4980 const intptr_t specialization_cid = function.regexp_cid();
4981 const bool is_one_byte = (specialization_cid == kOneByteStringCid ||
4982 specialization_cid == kExternalOneByteStringCid);
4983 JSRegExp& regexp = JSRegExp::Handle(isolate, function.regexp());
4984 const String& pattern = String::Handle(isolate, regexp.pattern());
4985
4986 ASSERT(!regexp.IsNull());
4987 ASSERT(!pattern.IsNull());
4988
4989 const bool ignore_case = regexp.is_ignore_case();
4990 const bool is_global = regexp.is_global();
4991
4992 RegExpCompiler compiler(data->capture_count, ignore_case, specialization_cid);
4993
4994 // TODO(zerny): Frequency sampling is currently disabled because of several
4995 // issues. We do not want to store subject strings in the regexp object since
4996 // they might be long and we should not prevent their garbage collection.
4997 // Passing them to this function explicitly does not help, since we must
4998 // generate exactly the same IR for both the unoptimizing and optimizing
4999 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5000 // An option would be to store sampling results in the regexp object, but
5001 // I'm not sure the performance gains are relevant enough.
5002
5003 // Wrap the body of the regexp in capture #0.
5004 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
5005 0,
5006 &compiler,
5007 compiler.accept());
5008
5009 RegExpNode* node = captured_body;
5010 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5011 bool is_start_anchored = data->tree->IsAnchoredAtStart();
5012 intptr_t max_length = data->tree->max_match();
5013 if (!is_start_anchored) {
5014 // Add a .*? at the beginning, outside the body capture, unless
5015 // this expression is anchored at the beginning.
5016 RegExpNode* loop_node =
5017 RegExpQuantifier::ToNode(0,
5018 RegExpTree::kInfinity,
5019 false,
5020 new(isolate) RegExpCharacterClass('*'),
5021 &compiler,
5022 captured_body,
5023 data->contains_anchor);
5024
5025 if (data->contains_anchor) {
5026 // Unroll loop once, to take care of the case that might start
5027 // at the start of input.
5028 ChoiceNode* first_step_node = new(isolate) ChoiceNode(2, isolate);
5029 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5030 first_step_node->AddAlternative(GuardedAlternative(
5031 new(isolate) TextNode(
5032 new(isolate) RegExpCharacterClass('*'), loop_node)));
5033 node = first_step_node;
5034 } else {
5035 node = loop_node;
5036 }
5037 }
5038 if (is_one_byte) {
5039 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
5040 // Do it again to propagate the new nodes to places where they were not
5041 // put because they had not been calculated yet.
5042 if (node != NULL) {
5043 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
5044 }
5045 }
5046
5047 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate);
5048 data->node = node;
5049 Analysis analysis(ignore_case, is_one_byte);
5050 analysis.EnsureAnalyzed(node);
5051 if (analysis.has_failed()) {
5052 const char* error_message = analysis.error_message();
5053 return CompilationResult(error_message);
5054 }
5055
5056 // Native regexp implementation.
5057
5058 IRRegExpMacroAssembler* macro_assembler =
5059 new(isolate) IRRegExpMacroAssembler(specialization_cid,
5060 data->capture_count,
5061 parsed_function,
5062 ic_data_array,
5063 isolate);
5064
5065 // Inserted here, instead of in Assembler, because it depends on information
5066 // in the AST that isn't replicated in the Node structure.
5067 static const intptr_t kMaxBacksearchLimit = 1024;
5068 if (is_end_anchored &&
5069 !is_start_anchored &&
5070 max_length < kMaxBacksearchLimit) {
5071 macro_assembler->SetCurrentPositionFromEnd(max_length);
5072 }
5073
5074 if (is_global) {
5075 macro_assembler->set_global_mode(
5076 (data->tree->min_match() > 0)
5077 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
5078 : RegExpMacroAssembler::GLOBAL);
5079 }
5080
5081 RegExpEngine::CompilationResult result =
5082 compiler.Assemble(macro_assembler,
5083 node,
5084 data->capture_count,
5085 pattern);
5086
5087 if (FLAG_trace_irregexp) {
5088 macro_assembler->PrintBlocks();
5089 }
5090
5091 return result;
5092 }
5093
5094
5095 static void CreateSpecializedFunction(Isolate* isolate,
5096 const JSRegExp& regexp,
5097 intptr_t specialization_cid,
5098 const Object& owner) {
5099 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
5100
5101 Function& fn = Function::Handle(isolate,
5102 Function::New(String::Handle(isolate, Symbols::New("RegExp")),
5103 RawFunction::kIrregexpFunction,
5104 true, // Static.
5105 false, // Not const.
5106 false, // Not abstract.
5107 false, // Not external.
5108 false, // Not native.
5109 owner,
5110 0)); // Requires a non-negative token position.
5111
5112 // TODO(zerny): Share these arrays between all irregexp functions.
5113 fn.set_num_fixed_parameters(kParamCount);
5114 fn.set_parameter_types(Array::Handle(isolate, Array::New(kParamCount,
5115 Heap::kOld)));
5116 fn.set_parameter_names(Array::Handle(isolate, Array::New(kParamCount,
5117 Heap::kOld)));
5118 fn.SetParameterTypeAt(0, Type::Handle(isolate, Type::DynamicType()));
5119 fn.SetParameterNameAt(0, Symbols::string_param_());
5120 fn.SetParameterTypeAt(1, Type::Handle(isolate, Type::DynamicType()));
5121 fn.SetParameterNameAt(1, Symbols::start_index_param_());
5122 fn.set_result_type(Type::Handle(isolate, Type::ArrayType()));
5123
5124 // Cache the result.
5125 regexp.set_function(specialization_cid, fn);
5126
5127 fn.set_regexp(regexp);
5128 fn.set_regexp_cid(specialization_cid);
5129
5130 // The function is compiled lazily during the first call.
5131 }
5132
5133
5134 RawJSRegExp* RegExpEngine::CreateJSRegExp(Isolate* isolate,
5135 const String& pattern,
5136 bool multi_line,
5137 bool ignore_case) {
5138 const JSRegExp& regexp = JSRegExp::Handle(JSRegExp::New(0));
5139
5140 regexp.set_pattern(pattern);
5141
5142 if (multi_line) {
5143 regexp.set_is_multi_line();
5144 }
5145 if (ignore_case) {
5146 regexp.set_is_ignore_case();
5147 }
5148
5149 // TODO(zerny): We might want to use normal string searching algorithms
5150 // for simple patterns.
5151 regexp.set_is_complex();
5152 regexp.set_is_global(); // All dart regexps are global.
5153
5154 const Library& lib = Library::Handle(isolate, Library::CoreLibrary());
5155 const Class& owner = Class::Handle(isolate,
5156 lib.LookupClass(String::Handle(isolate, Symbols::New("RegExp"))));
5157
5158 CreateSpecializedFunction(isolate, regexp, kOneByteStringCid, owner);
5159 CreateSpecializedFunction(isolate, regexp, kTwoByteStringCid, owner);
5160 CreateSpecializedFunction(isolate, regexp, kExternalOneByteStringCid, owner);
5161 CreateSpecializedFunction(isolate, regexp, kExternalTwoByteStringCid, owner);
5162
5163 return regexp.raw();
5164 }
5165
5166
5167 } // namespace dart
OLDNEW
« runtime/vm/object.cc ('K') | « runtime/vm/regexp.h ('k') | runtime/vm/regexp_assembler.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698