runtime/vm/regexp.cc - Issue 683433003: Integrate the Irregexp Regular Expression Engine.

Side by Side Diff: runtime/vm/regexp.cc

Issue 683433003: Integrate the Irregexp Regular Expression Engine. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: rebase, enable tests, remove *CodeUnitsAt Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 #include "vm/regexp.h"

	6

	7 #include "vm/dart_entry.h"

	8 #include "vm/regexp_assembler.h"

	9 #include "vm/regexp_ast.h"

	10 #include "vm/unibrow-inl.h"

	11 #include "vm/unicode.h"

	12 #include "vm/symbols.h"

	13

	14 #define I (isolate())

	15 #define CI (compiler->isolate())

	16

	17 namespace dart {

	18

	19 DECLARE_FLAG(bool, trace_irregexp);

	20

	21 // Default to generating optimized regexp code.

	22 static const bool kRegexpOptimization = true;

	23

	24 // More makes code generation slower, less makes V8 benchmark score lower.

	25 static const intptr_t kMaxLookaheadForBoyerMoore = 8;

	26

	27 ContainedInLattice AddRange(ContainedInLattice containment,

	28 const intptr_t* ranges,

	29 intptr_t ranges_length,

	30 Interval new_range) {

	31 ASSERT((ranges_length & 1) == 1);

	32 ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1);

	33 if (containment == kLatticeUnknown) return containment;

	34 bool inside = false;

	35 intptr_t last = 0;

	36 for (intptr_t i = 0; i < ranges_length;

	37 inside = !inside, last = ranges[i], i++) {

	38 // Consider the range from last to ranges[i].

	39 // We haven't got to the new range yet.

	40 if (ranges[i] <= new_range.from()) continue;

	41 // New range is wholly inside last-ranges[i]. Note that new_range.to() is

	42 // inclusive, but the values in ranges are not.

	43 if (last <= new_range.from() && new_range.to() < ranges[i]) {

	44 return Combine(containment, inside ? kLatticeIn : kLatticeOut);

	45 }

	46 return kLatticeUnknown;

	47 }

	48 return containment;

	49 }

	50

	51 // -------------------------------------------------------------------

	52 // Implementation of the Irregexp regular expression engine.

	53 //

	54 // The Irregexp regular expression engine is intended to be a complete

	55 // implementation of ECMAScript regular expressions. It generates

	56 // IR code that is subsequently compiled to native code.

	57

	58 // The Irregexp regexp engine is structured in three steps.

	59 // 1) The parser generates an abstract syntax tree. See regexp_ast.cc.

	60 // 2) From the AST a node network is created. The nodes are all

	61 // subclasses of RegExpNode. The nodes represent states when

	62 // executing a regular expression. Several optimizations are

	63 // performed on the node network.

	64 // 3) From the nodes we generate IR instructions that can actually

	65 // execute the regular expression (perform the search). The

	66 // code generation step is described in more detail below.

	67

	68 // Code generation.

	69 //

	70 // The nodes are divided into four main categories.

	71 // * Choice nodes

	72 // These represent places where the regular expression can

	73 // match in more than one way. For example on entry to an

	74 // alternation (foo\|bar) or a repetition (*, +, ? or {}).

	75 // * Action nodes

	76 // These represent places where some action should be

	77 // performed. Examples include recording the current position

	78 // in the input string to a register (in order to implement

	79 // captures) or other actions on register for example in order

	80 // to implement the counters needed for {} repetitions.

	81 // * Matching nodes

	82 // These attempt to match some element part of the input string.

	83 // Examples of elements include character classes, plain strings

	84 // or back references.

	85 // * End nodes

	86 // These are used to implement the actions required on finding

	87 // a successful match or failing to find a match.

	88 //

	89 // The code generated maintains some state as it runs. This consists of the

	90 // following elements:

	91 //

	92 // * The capture registers. Used for string captures.

	93 // * Other registers. Used for counters etc.

	94 // * The current position.

	95 // * The stack of backtracking information. Used when a matching node

	96 // fails to find a match and needs to try an alternative.

	97 //

	98 // Conceptual regular expression execution model:

	99 //

	100 // There is a simple conceptual model of regular expression execution

	101 // which will be presented first. The actual code generated is a more

	102 // efficient simulation of the simple conceptual model:

	103 //

	104 // * Choice nodes are implemented as follows:

	105 // For each choice except the last {

	106 // push current position

	107 // push backtrack code location

	108 // <generate code to test for choice>

	109 // backtrack code location:

	110 // pop current position

	111 // }

	112 // <generate code to test for last choice>

	113 //

	114 // * Actions nodes are generated as follows

	115 // <push affected registers on backtrack stack>

	116 // <generate code to perform action>

	117 // push backtrack code location

	118 // <generate code to test for following nodes>

	119 // backtrack code location:

	120 // <pop affected registers to restore their state>

	121 // <pop backtrack location from stack and go to it>

	122 //

	123 // * Matching nodes are generated as follows:

	124 // if input string matches at current position

	125 // update current position

	126 // <generate code to test for following nodes>

	127 // else

	128 // <pop backtrack location from stack and go to it>

	129 //

	130 // Thus it can be seen that the current position is saved and restored

	131 // by the choice nodes, whereas the registers are saved and restored by

	132 // by the action nodes that manipulate them.

	133 //

	134 // The other interesting aspect of this model is that nodes are generated

	135 // at the point where they are needed by a recursive call to Emit(). If

	136 // the node has already been code generated then the Emit() call will

	137 // generate a jump to the previously generated code instead. In order to

	138 // limit recursion it is possible for the Emit() function to put the node

	139 // on a work list for later generation and instead generate a jump. The

	140 // destination of the jump is resolved later when the code is generated.

	141 //

	142 // Actual regular expression code generation.

	143 //

	144 // Code generation is actually more complicated than the above. In order

	145 // to improve the efficiency of the generated code some optimizations are

	146 // performed

	147 //

	148 // * Choice nodes have 1-character lookahead.

	149 // A choice node looks at the following character and eliminates some of

	150 // the choices immediately based on that character. This is not yet

	151 // implemented.

	152 // * Simple greedy loops store reduced backtracking information.

	153 // A quantifier like /.*foo/m will greedily match the whole input. It will

	154 // then need to backtrack to a point where it can match "foo". The naive

	155 // implementation of this would push each character position onto the

	156 // backtracking stack, then pop them off one by one. This would use space

	157 // proportional to the length of the input string. However since the "."

	158 // can only match in one way and always has a constant length (in this case

	159 // of 1) it suffices to store the current position on the top of the stack

	160 // once. Matching now becomes merely incrementing the current position and

	161 // backtracking becomes decrementing the current position and checking the

	162 // result against the stored current position. This is faster and saves

	163 // space.

	164 // * The current state is virtualized.

	165 // This is used to defer expensive operations until it is clear that they

	166 // are needed and to generate code for a node more than once, allowing

	167 // specialized an efficient versions of the code to be created. This is

	168 // explained in the section below.

	169 //

	170 // Execution state virtualization.

	171 //

	172 // Instead of emitting code, nodes that manipulate the state can record their

	173 // manipulation in an object called the Trace. The Trace object can record a

	174 // current position offset, an optional backtrack code location on the top of

	175 // the virtualized backtrack stack and some register changes. When a node is

	176 // to be emitted it can flush the Trace or update it. Flushing the Trace

	177 // will emit code to bring the actual state into line with the virtual state.

	178 // Avoiding flushing the state can postpone some work (e.g. updates of capture

	179 // registers). Postponing work can save time when executing the regular

	180 // expression since it may be found that the work never has to be done as a

	181 // failure to match can occur. In addition it is much faster to jump to a

	182 // known backtrack code location than it is to pop an unknown backtrack

	183 // location from the stack and jump there.

	184 //

	185 // The virtual state found in the Trace affects code generation. For example

	186 // the virtual state contains the difference between the actual current

	187 // position and the virtual current position, and matching code needs to use

	188 // this offset to attempt a match in the correct location of the input

	189 // string. Therefore code generated for a non-trivial trace is specialized

	190 // to that trace. The code generator therefore has the ability to generate

	191 // code for each node several times. In order to limit the size of the

	192 // generated code there is an arbitrary limit on how many specialized sets of

	193 // code may be generated for a given node. If the limit is reached, the

	194 // trace is flushed and a generic version of the code for a node is emitted.

	195 // This is subsequently used for that node. The code emitted for non-generic

	196 // trace is not recorded in the node and so it cannot currently be reused in

	197 // the event that code generation is requested for an identical trace.

	198

	199

	200 void RegExpTree::AppendToText(RegExpText* text) {

	201 UNREACHABLE();

	202 }

	203

	204

	205 void RegExpAtom::AppendToText(RegExpText* text) {

	206 text->AddElement(TextElement::Atom(this));

	207 }

	208

	209

	210 void RegExpCharacterClass::AppendToText(RegExpText* text) {

	211 text->AddElement(TextElement::CharClass(this));

	212 }

	213

	214

	215 void RegExpText::AppendToText(RegExpText* text) {

	216 for (intptr_t i = 0; i < elements()->length(); i++)

	217 text->AddElement((*elements())[i]);

	218 }

	219

	220

	221 TextElement TextElement::Atom(RegExpAtom* atom) {

	222 return TextElement(ATOM, atom);

	223 }

	224

	225

	226 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {

	227 return TextElement(CHAR_CLASS, char_class);

	228 }

	229

	230

	231 intptr_t TextElement::length() const {

	232 switch (text_type()) {

	233 case ATOM:

	234 return atom()->length();

	235

	236 case CHAR_CLASS:

	237 return 1;

	238 }

	239 UNREACHABLE();

	240 return 0;

	241 }

	242

	243

	244 class FrequencyCollator : public ValueObject {

	245 public:

	246 FrequencyCollator() : total_samples_(0) {

	247 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {

	248 frequencies_[i] = CharacterFrequency(i);

	249 }

	250 }

	251

	252 void CountCharacter(intptr_t character) {

	253 intptr_t index = (character & RegExpMacroAssembler::kTableMask);

	254 frequencies_[index].Increment();

	255 total_samples_++;

	256 }

	257

	258 // Does not measure in percent, but rather per-128 (the table size from the

	259 // regexp macro assembler).

	260 intptr_t Frequency(intptr_t in_character) {

	261 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);

	262 if (total_samples_ < 1) return 1; // Division by zero.

	263 intptr_t freq_in_per128 =

	264 (frequencies_[in_character].counter() * 128) / total_samples_;

	265 return freq_in_per128;

	266 }

	267

	268 private:

	269 class CharacterFrequency {

	270 public:

	271 CharacterFrequency() : counter_(0), character_(-1) { }

	272 explicit CharacterFrequency(intptr_t character)

	273 : counter_(0), character_(character) { }

	274

	275 void Increment() { counter_++; }

	276 intptr_t counter() { return counter_; }

	277 intptr_t character() { return character_; }

	278

	279 private:

	280 intptr_t counter_;

	281 intptr_t character_;

	282

	283 DISALLOW_ALLOCATION();

	284 };

	285

	286

	287 private:

	288 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];

	289 intptr_t total_samples_;

	290 };

	291

	292

	293 class RegExpCompiler : public ValueObject {

	294 public:

	295 RegExpCompiler(intptr_t capture_count,

	296 bool ignore_case,

	297 intptr_t specialization_cid);

	298

	299 intptr_t AllocateRegister() {

	300 return next_register_++;

	301 }

	302

	303 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,

	304 RegExpNode* start,

	305 intptr_t capture_count,

	306 const String& pattern);

	307

	308 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }

	309

	310 static const intptr_t kImplementationOffset = 0;

	311 static const intptr_t kNumberOfRegistersOffset = 0;

	312 static const intptr_t kCodeOffset = 1;

	313

	314 IRRegExpMacroAssembler* macro_assembler() { return macro_assembler_; }

	315 EndNode* accept() { return accept_; }

	316

	317 static const intptr_t kMaxRecursion = 100;

	318 inline intptr_t recursion_depth() { return recursion_depth_; }

	319 inline void IncrementRecursionDepth() { recursion_depth_++; }

	320 inline void DecrementRecursionDepth() { recursion_depth_--; }

	321

	322 void SetRegExpTooBig() { reg_exp_too_big_ = true; }

	323

	324 inline bool ignore_case() { return ignore_case_; }

	325 inline bool one_byte() const {

	326 return (specialization_cid_ == kOneByteStringCid \|\|

	327 specialization_cid_ == kExternalOneByteStringCid);

	328 }

	329 inline intptr_t specialization_cid() { return specialization_cid_; }

	330 FrequencyCollator* frequency_collator() { return &frequency_collator_; }

	331

	332 intptr_t current_expansion_factor() { return current_expansion_factor_; }

	333 void set_current_expansion_factor(intptr_t value) {

	334 current_expansion_factor_ = value;

	335 }

	336

	337 Isolate* isolate() const { return isolate_; }

	338

	339 static const intptr_t kNoRegister = -1;

	340

	341 private:

	342 EndNode* accept_;

	343 intptr_t next_register_;

	344 ZoneGrowableArray<RegExpNode> work_list_;

	345 intptr_t recursion_depth_;

	346 IRRegExpMacroAssembler* macro_assembler_;

	347 bool ignore_case_;

	348 intptr_t specialization_cid_;

	349 bool reg_exp_too_big_;

	350 intptr_t current_expansion_factor_;

	351 FrequencyCollator frequency_collator_;

	352 Isolate* isolate_;

	353 };

	354

	355

	356 class RecursionCheck : public ValueObject {

	357 public:

	358 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {

	359 compiler->IncrementRecursionDepth();

	360 }

	361 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }

	362 private:

	363 RegExpCompiler* compiler_;

	364 };

	365

	366

	367 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {

	368 return RegExpEngine::CompilationResult("RegExp too big");

	369 }

	370

	371

	372 // Attempts to compile the regexp using an Irregexp code generator. Returns

	373 // a fixed array or a null handle depending on whether it succeeded.

	374 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case,

	375 intptr_t specialization_cid)

	376 : next_register_(2 * (capture_count + 1)),

	377 work_list_(NULL),

	378 recursion_depth_(0),

	379 ignore_case_(ignore_case),

	380 specialization_cid_(specialization_cid),

	381 reg_exp_too_big_(false),

	382 current_expansion_factor_(1),

	383 isolate_(Isolate::Current()) {

	384 accept_ = new(I) EndNode(EndNode::ACCEPT, I);

	385 }

	386

	387

	388 RegExpEngine::CompilationResult RegExpCompiler::Assemble(

	389 IRRegExpMacroAssembler* macro_assembler,

	390 RegExpNode* start,

	391 intptr_t capture_count,

	392 const String& pattern) {

	393 static const bool use_slow_safe_regexp_compiler = false;

	394

	395 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);

	396 macro_assembler_ = macro_assembler;

	397

	398 ZoneGrowableArray<RegExpNode*> work_list(0);

	399 work_list_ = &work_list;

	400 BlockLabel fail;

	401 macro_assembler_->PushBacktrack(&fail);

	402 Trace new_trace;

	403 start->Emit(this, &new_trace);

	404 macro_assembler_->BindBlock(&fail);

	405 macro_assembler_->Fail();

	406 while (!work_list.is_empty()) {

	407 work_list.RemoveLast()->Emit(this, &new_trace);

	408 }

	409 if (reg_exp_too_big_) return IrregexpRegExpTooBig();

	410

	411 macro_assembler->FinalizeIndirectGotos();

	412

	413 return RegExpEngine::CompilationResult(macro_assembler,

	414 macro_assembler->graph_entry(),

	415 macro_assembler->num_blocks(),

	416 macro_assembler->num_stack_locals());

	417 }

	418

	419

	420 bool Trace::DeferredAction::Mentions(intptr_t that) {

	421 if (action_type() == ActionNode::CLEAR_CAPTURES) {

	422 Interval range = static_cast<DeferredClearCaptures*>(this)->range();

	423 return range.Contains(that);

	424 } else {

	425 return reg() == that;

	426 }

	427 }

	428

	429

	430 bool Trace::mentions_reg(intptr_t reg) {

	431 for (DeferredAction* action = actions_;

	432 action != NULL;

	433 action = action->next()) {

	434 if (action->Mentions(reg))

	435 return true;

	436 }

	437 return false;

	438 }

	439

	440

	441 bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {

	442 ASSERT(*cp_offset == 0);

	443 for (DeferredAction* action = actions_;

	444 action != NULL;

	445 action = action->next()) {

	446 if (action->Mentions(reg)) {

	447 if (action->action_type() == ActionNode::STORE_POSITION) {

	448 cp_offset = static_cast<DeferredCapture>(action)->cp_offset();

	449 return true;

	450 } else {

	451 return false;

	452 }

	453 }

	454 }

	455 return false;

	456 }

	457

	458

	459 // This is called as we come into a loop choice node and some other tricky

	460 // nodes. It normalizes the state of the code generator to ensure we can

	461 // generate generic code.

	462 intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers,

	463 Isolate* isolate) {

	464 intptr_t max_register = RegExpCompiler::kNoRegister;

	465 for (DeferredAction* action = actions_;

	466 action != NULL;

	467 action = action->next()) {

	468 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {

	469 Interval range = static_cast<DeferredClearCaptures*>(action)->range();

	470 for (intptr_t i = range.from(); i <= range.to(); i++)

	471 affected_registers->Set(i, isolate);

	472 if (range.to() > max_register) max_register = range.to();

	473 } else {

	474 affected_registers->Set(action->reg(), isolate);

	475 if (action->reg() > max_register) max_register = action->reg();

	476 }

	477 }

	478 return max_register;

	479 }

	480

	481

	482 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,

	483 intptr_t max_register,

	484 const OutSet& registers_to_pop,

	485 const OutSet& registers_to_clear) {

	486 for (intptr_t reg = max_register; reg >= 0; reg--) {

	487 if (registers_to_pop.Get(reg)) {

	488 assembler->PopRegister(reg);

	489 } else if (registers_to_clear.Get(reg)) {

	490 intptr_t clear_to = reg;

	491 while (reg > 0 && registers_to_clear.Get(reg - 1)) {

	492 reg--;

	493 }

	494 assembler->ClearRegisters(reg, clear_to);

	495 }

	496 }

	497 }

	498

	499

	500 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,

	501 intptr_t max_register,

	502 const OutSet& affected_registers,

	503 OutSet* registers_to_pop,

	504 OutSet* registers_to_clear,

	505 Isolate* isolate) {

	506 for (intptr_t reg = 0; reg <= max_register; reg++) {

	507 if (!affected_registers.Get(reg)) {

	508 continue;

	509 }

	510

	511 // The chronologically first deferred action in the trace

	512 // is used to infer the action needed to restore a register

	513 // to its previous state (or not, if it's safe to ignore it).

	514 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };

	515 DeferredActionUndoType undo_action = IGNORE;

	516

	517 intptr_t value = 0;

	518 bool absolute = false;

	519 bool clear = false;

	520 intptr_t store_position = -1;

	521 // This is a little tricky because we are scanning the actions in reverse

	522 // historical order (newest first).

	523 for (DeferredAction* action = actions_;

	524 action != NULL;

	525 action = action->next()) {

	526 if (action->Mentions(reg)) {

	527 switch (action->action_type()) {

	528 case ActionNode::SET_REGISTER: {

	529 Trace::DeferredSetRegister* psr =

	530 static_cast<Trace::DeferredSetRegister*>(action);

	531 if (!absolute) {

	532 value += psr->value();

	533 absolute = true;

	534 }

	535 // SET_REGISTER is currently only used for newly introduced loop

	536 // counters. They can have a significant previous value if they

	537 // occour in a loop. TODO(lrn): Propagate this information, so

	538 // we can set undo_action to IGNORE if we know there is no value to

	539 // restore.

	540 undo_action = RESTORE;

	541 ASSERT(store_position == -1);

	542 ASSERT(!clear);

	543 break;

	544 }

	545 case ActionNode::INCREMENT_REGISTER:

	546 if (!absolute) {

	547 value++;

	548 }

	549 ASSERT(store_position == -1);

	550 ASSERT(!clear);

	551 undo_action = RESTORE;

	552 break;

	553 case ActionNode::STORE_POSITION: {

	554 Trace::DeferredCapture* pc =

	555 static_cast<Trace::DeferredCapture*>(action);

	556 if (!clear && store_position == -1) {

	557 store_position = pc->cp_offset();

	558 }

	559

	560 // For captures we know that stores and clears alternate.

	561 // Other register, are never cleared, and if the occur

	562 // inside a loop, they might be assigned more than once.

	563 if (reg <= 1) {

	564 // Registers zero and one, aka "capture zero", is

	565 // always set correctly if we succeed. There is no

	566 // need to undo a setting on backtrack, because we

	567 // will set it again or fail.

	568 undo_action = IGNORE;

	569 } else {

	570 undo_action = pc->is_capture() ? CLEAR : RESTORE;

	571 }

	572 ASSERT(!absolute);

	573 ASSERT(value == 0);

	574 break;

	575 }

	576 case ActionNode::CLEAR_CAPTURES: {

	577 // Since we're scanning in reverse order, if we've already

	578 // set the position we have to ignore historically earlier

	579 // clearing operations.

	580 if (store_position == -1) {

	581 clear = true;

	582 }

	583 undo_action = RESTORE;

	584 ASSERT(!absolute);

	585 ASSERT(value == 0);

	586 break;

	587 }

	588 default:

	589 UNREACHABLE();

	590 break;

	591 }

	592 }

	593 }

	594 // Prepare for the undo-action (e.g., push if it's going to be popped).

	595 if (undo_action == RESTORE) {

	596 assembler->PushRegister(reg);

	597 registers_to_pop->Set(reg, isolate);

	598 } else if (undo_action == CLEAR) {

	599 registers_to_clear->Set(reg, isolate);

	600 }

	601 // Perform the chronologically last action (or accumulated increment)

	602 // for the register.

	603 if (store_position != -1) {

	604 assembler->WriteCurrentPositionToRegister(reg, store_position);

	605 } else if (clear) {

	606 assembler->ClearRegisters(reg, reg);

	607 } else if (absolute) {

	608 assembler->SetRegister(reg, value);

	609 } else if (value != 0) {

	610 assembler->AdvanceRegister(reg, value);

	611 }

	612 }

	613 }

	614

	615

	616 // This is called as we come into a loop choice node and some other tricky

	617 // nodes. It normalizes the state of the code generator to ensure we can

	618 // generate generic code.

	619 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {

	620 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	621

	622 ASSERT(!is_trivial());

	623

	624 if (actions_ == NULL && backtrack() == NULL) {

	625 // Here we just have some deferred cp advances to fix and we are back to

	626 // a normal situation. We may also have to forget some information gained

	627 // through a quick check that was already performed.

	628 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);

	629 // Create a new trivial state and generate the node with that.

	630 Trace new_state;

	631 successor->Emit(compiler, &new_state);

	632 return;

	633 }

	634

	635 // Generate deferred actions here along with code to undo them again.

	636 OutSet affected_registers;

	637

	638 if (backtrack() != NULL) {

	639 // Here we have a concrete backtrack location. These are set up by choice

	640 // nodes and so they indicate that we have a deferred save of the current

	641 // position which we may need to emit here.

	642 assembler->PushCurrentPosition();

	643 }

	644

	645 intptr_t max_register = FindAffectedRegisters(&affected_registers, CI);

	646 OutSet registers_to_pop;

	647 OutSet registers_to_clear;

	648 PerformDeferredActions(assembler,

	649 max_register,

	650 affected_registers,

	651 &registers_to_pop,

	652 &registers_to_clear,

	653 CI);

	654 if (cp_offset_ != 0) {

	655 assembler->AdvanceCurrentPosition(cp_offset_);

	656 }

	657

	658 // Create a new trivial state and generate the node with that.

	659 BlockLabel undo;

	660 assembler->PushBacktrack(&undo);

	661 Trace new_state;

	662 successor->Emit(compiler, &new_state);

	663

	664 // On backtrack we need to restore state.

	665 assembler->BindBlock(&undo);

	666 RestoreAffectedRegisters(assembler,

	667 max_register,

	668 registers_to_pop,

	669 registers_to_clear);

	670 if (backtrack() == NULL) {

	671 assembler->Backtrack();

	672 } else {

	673 assembler->PopCurrentPosition();

	674 assembler->GoTo(backtrack());

	675 }

	676 }

	677

	678

	679 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {

	680 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	681

	682 // Omit flushing the trace. We discard the entire stack frame anyway.

	683

	684 if (!label()->IsBound()) {

	685 // We are completely independent of the trace, since we ignore it,

	686 // so this code can be used as the generic version.

	687 assembler->BindBlock(label());

	688 }

	689

	690 // Throw away everything on the backtrack stack since the start

	691 // of the negative submatch and restore the character position.

	692 assembler->ReadCurrentPositionFromRegister(current_position_register_);

	693 assembler->ReadStackPointerFromRegister(stack_pointer_register_);

	694 if (clear_capture_count_ > 0) {

	695 // Clear any captures that might have been performed during the success

	696 // of the body of the negative look-ahead.

	697 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;

	698 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);

	699 }

	700 // Now that we have unwound the stack we find at the top of the stack the

	701 // backtrack that the BeginSubmatch node got.

	702 assembler->Backtrack();

	703 }

	704

	705

	706 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	707 if (!trace->is_trivial()) {

	708 trace->Flush(compiler, this);

	709 return;

	710 }

	711 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	712 if (!label()->IsBound()) {

	713 assembler->BindBlock(label());

	714 }

	715 switch (action_) {

	716 case ACCEPT:

	717 assembler->Succeed();

	718 return;

	719 case BACKTRACK:

	720 assembler->GoTo(trace->backtrack());

	721 return;

	722 case NEGATIVE_SUBMATCH_SUCCESS:

	723 // This case is handled in a different virtual method.

	724 UNREACHABLE();

	725 }

	726 UNIMPLEMENTED();

	727 }

	728

	729

	730 void GuardedAlternative::AddGuard(Guard* guard, Isolate* isolate) {

	731 if (guards_ == NULL)

	732 guards_ = new(isolate) ZoneGrowableArray<Guard*>(1);

	733 guards_->Add(guard);

	734 }

	735

	736

	737 ActionNode* ActionNode::SetRegister(intptr_t reg,

	738 intptr_t val,

	739 RegExpNode* on_success) {

	740 ActionNode* result =

	741 new(on_success->isolate()) ActionNode(SET_REGISTER, on_success);

	742 result->data_.u_store_register.reg = reg;

	743 result->data_.u_store_register.value = val;

	744 return result;

	745 }

	746

	747

	748 ActionNode* ActionNode::IncrementRegister(intptr_t reg,

	749 RegExpNode* on_success) {

	750 ActionNode* result =

	751 new(on_success->isolate()) ActionNode(INCREMENT_REGISTER, on_success);

	752 result->data_.u_increment_register.reg = reg;

	753 return result;

	754 }

	755

	756

	757 ActionNode* ActionNode::StorePosition(intptr_t reg,

	758 bool is_capture,

	759 RegExpNode* on_success) {

	760 ActionNode* result =

	761 new(on_success->isolate()) ActionNode(STORE_POSITION, on_success);

	762 result->data_.u_position_register.reg = reg;

	763 result->data_.u_position_register.is_capture = is_capture;

	764 return result;

	765 }

	766

	767

	768 ActionNode* ActionNode::ClearCaptures(Interval range,

	769 RegExpNode* on_success) {

	770 ActionNode* result =

	771 new(on_success->isolate()) ActionNode(CLEAR_CAPTURES, on_success);

	772 result->data_.u_clear_captures.range_from = range.from();

	773 result->data_.u_clear_captures.range_to = range.to();

	774 return result;

	775 }

	776

	777

	778 ActionNode* ActionNode::BeginSubmatch(intptr_t stack_reg,

	779 intptr_t position_reg,

	780 RegExpNode* on_success) {

	781 ActionNode* result =

	782 new(on_success->isolate()) ActionNode(BEGIN_SUBMATCH, on_success);

	783 result->data_.u_submatch.stack_pointer_register = stack_reg;

	784 result->data_.u_submatch.current_position_register = position_reg;

	785 return result;

	786 }

	787

	788

	789 ActionNode* ActionNode::PositiveSubmatchSuccess(intptr_t stack_reg,

	790 intptr_t position_reg,

	791 intptr_t clear_register_count,

	792 intptr_t clear_register_from,

	793 RegExpNode* on_success) {

	794 ActionNode* result =

	795 new(on_success->isolate()) ActionNode(POSITIVE_SUBMATCH_SUCCESS,

	796 on_success);

	797 result->data_.u_submatch.stack_pointer_register = stack_reg;

	798 result->data_.u_submatch.current_position_register = position_reg;

	799 result->data_.u_submatch.clear_register_count = clear_register_count;

	800 result->data_.u_submatch.clear_register_from = clear_register_from;

	801 return result;

	802 }

	803

	804

	805 ActionNode* ActionNode::EmptyMatchCheck(intptr_t start_register,

	806 intptr_t repetition_register,

	807 intptr_t repetition_limit,

	808 RegExpNode* on_success) {

	809 ActionNode* result =

	810 new(on_success->isolate()) ActionNode(EMPTY_MATCH_CHECK, on_success);

	811 result->data_.u_empty_match_check.start_register = start_register;

	812 result->data_.u_empty_match_check.repetition_register = repetition_register;

	813 result->data_.u_empty_match_check.repetition_limit = repetition_limit;

	814 return result;

	815 }

	816

	817

	818 #define DEFINE_ACCEPT(Type) \

	819 void Type##Node::Accept(NodeVisitor* visitor) { \

	820 visitor->Visit##Type(this); \

	821 }

	822 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)

	823 #undef DEFINE_ACCEPT

	824

	825

	826 void LoopChoiceNode::Accept(NodeVisitor* visitor) {

	827 visitor->VisitLoopChoice(this);

	828 }

	829

	830

	831 // -------------------------------------------------------------------

	832 // Emit code.

	833

	834

	835 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,

	836 Guard* guard,

	837 Trace* trace) {

	838 switch (guard->op()) {

	839 case Guard::LT:

	840 ASSERT(!trace->mentions_reg(guard->reg()));

	841 macro_assembler->IfRegisterGE(guard->reg(),

	842 guard->value(),

	843 trace->backtrack());

	844 break;

	845 case Guard::GEQ:

	846 ASSERT(!trace->mentions_reg(guard->reg()));

	847 macro_assembler->IfRegisterLT(guard->reg(),

	848 guard->value(),

	849 trace->backtrack());

	850 break;

	851 }

	852 }

	853

	854

	855 // Returns the number of characters in the equivalence class, omitting those

	856 // that cannot occur in the source string because it is ASCII.

	857 static intptr_t GetCaseIndependentLetters(uint16_t character,

	858 bool one_byte_subject,

	859 int32_t* letters) {

	860 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;

	861 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);

	862 // Unibrow returns 0 or 1 for characters where case independence is

	863 // trivial.

	864 if (length == 0) {

	865 letters[0] = character;

	866 length = 1;

	867 }

	868 if (!one_byte_subject \|\| character <= Symbols::kMaxOneCharCodeSymbol) {

	869 return length;

	870 }

	871

	872 // The standard requires that non-ASCII characters cannot have ASCII

	873 // character codes in their equivalence class.

	874 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,

	875 // is it? For example, \u00C5 is equivalent to \u212B.

	876 return 0;

	877 }

	878

	879

	880 static inline bool EmitSimpleCharacter(Isolate* isolate,

	881 RegExpCompiler* compiler,

	882 uint16_t c,

	883 BlockLabel* on_failure,

	884 intptr_t cp_offset,

	885 bool check,

	886 bool preloaded) {

	887 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	888 bool bound_checked = false;

	889 if (!preloaded) {

	890 assembler->LoadCurrentCharacter(

	891 cp_offset,

	892 on_failure,

	893 check);

	894 bound_checked = true;

	895 }

	896 assembler->CheckNotCharacter(c, on_failure);

	897 return bound_checked;

	898 }

	899

	900

	901 // Only emits non-letters (things that don't have case). Only used for case

	902 // independent matches.

	903 static inline bool EmitAtomNonLetter(Isolate* isolate,

	904 RegExpCompiler* compiler,

	905 uint16_t c,

	906 BlockLabel* on_failure,

	907 intptr_t cp_offset,

	908 bool check,

	909 bool preloaded) {

	910 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	911 bool one_byte = compiler->one_byte();

	912 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	913 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);

	914 if (length < 1) {

	915 // This can't match. Must be an one-byte subject and a non-one-byte

	916 // character. We do not need to do anything since the one-byte pass

	917 // already handled this.

	918 return false; // Bounds not checked.

	919 }

	920 bool checked = false;

	921 // We handle the length > 1 case in a later pass.

	922 if (length == 1) {

	923 if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {

	924 // Can't match - see above.

	925 return false; // Bounds not checked.

	926 }

	927 if (!preloaded) {

	928 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

	929 checked = check;

	930 }

	931 macro_assembler->CheckNotCharacter(c, on_failure);

	932 }

	933 return checked;

	934 }

	935

	936

	937 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

	938 bool one_byte,

	939 uint16_t c1,

	940 uint16_t c2,

	941 BlockLabel* on_failure) {

	942 uint16_t char_mask;

	943 if (one_byte) {

	944 char_mask = Symbols::kMaxOneCharCodeSymbol;

	945 } else {

	946 char_mask = Utf16::kMaxCodeUnit;

	947 }

	948 uint16_t exor = c1 ^ c2;

	949 // Check whether exor has only one bit set.

	950 if (((exor - 1) & exor) == 0) {

	951 // If c1 and c2 differ only by one bit.

	952 // Ecma262UnCanonicalize always gives the highest number last.

	953 ASSERT(c2 > c1);

	954 uint16_t mask = char_mask ^ exor;

	955 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);

	956 return true;

	957 }

	958 ASSERT(c2 > c1);

	959 uint16_t diff = c2 - c1;

	960 if (((diff - 1) & diff) == 0 && c1 >= diff) {

	961 // If the characters differ by 2^n but don't differ by one bit then

	962 // subtract the difference from the found character, then do the or

	963 // trick. We avoid the theoretical case where negative numbers are

	964 // involved in order to simplify code generation.

	965 uint16_t mask = char_mask ^ diff;

	966 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,

	967 diff,

	968 mask,

	969 on_failure);

	970 return true;

	971 }

	972 return false;

	973 }

	974

	975

	976 typedef bool EmitCharacterFunction(Isolate* isolate,

	977 RegExpCompiler* compiler,

	978 uint16_t c,

	979 BlockLabel* on_failure,

	980 intptr_t cp_offset,

	981 bool check,

	982 bool preloaded);

	983

	984 // Only emits letters (things that have case). Only used for case independent

	985 // matches.

	986 static inline bool EmitAtomLetter(Isolate* isolate,

	987 RegExpCompiler* compiler,

	988 uint16_t c,

	989 BlockLabel* on_failure,

	990 intptr_t cp_offset,

	991 bool check,

	992 bool preloaded) {

	993 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	994 bool one_byte = compiler->one_byte();

	995 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	996 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);

	997 if (length <= 1) return false;

	998 // We may not need to check against the end of the input string

	999 // if this character lies before a character that matched.

	1000 if (!preloaded) {

	1001 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

	1002 }

	1003 BlockLabel ok;

	1004 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

	1005 switch (length) {

	1006 case 2: {

	1007 if (ShortCutEmitCharacterPair(macro_assembler,

	1008 one_byte,

	1009 chars[0],

	1010 chars[1],

	1011 on_failure)) {

	1012 } else {

	1013 macro_assembler->CheckCharacter(chars[0], &ok);

	1014 macro_assembler->CheckNotCharacter(chars[1], on_failure);

	1015 macro_assembler->BindBlock(&ok);

	1016 }

	1017 break;

	1018 }

	1019 case 4:

	1020 macro_assembler->CheckCharacter(chars[3], &ok);

	1021 // Fall through!

	1022 case 3:

	1023 macro_assembler->CheckCharacter(chars[0], &ok);

	1024 macro_assembler->CheckCharacter(chars[1], &ok);

	1025 macro_assembler->CheckNotCharacter(chars[2], on_failure);

	1026 macro_assembler->BindBlock(&ok);

	1027 break;

	1028 default:

	1029 UNREACHABLE();

	1030 break;

	1031 }

	1032 return true;

	1033 }

	1034

	1035

	1036 static void EmitBoundaryTest(RegExpMacroAssembler* masm,

	1037 intptr_t border,

	1038 BlockLabel* fall_through,

	1039 BlockLabel* above_or_equal,

	1040 BlockLabel* below) {

	1041 if (below != fall_through) {

	1042 masm->CheckCharacterLT(border, below);

	1043 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);

	1044 } else {

	1045 masm->CheckCharacterGT(border - 1, above_or_equal);

	1046 }

	1047 }

	1048

	1049

	1050 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,

	1051 intptr_t first,

	1052 intptr_t last,

	1053 BlockLabel* fall_through,

	1054 BlockLabel* in_range,

	1055 BlockLabel* out_of_range) {

	1056 if (in_range == fall_through) {

	1057 if (first == last) {

	1058 masm->CheckNotCharacter(first, out_of_range);

	1059 } else {

	1060 masm->CheckCharacterNotInRange(first, last, out_of_range);

	1061 }

	1062 } else {

	1063 if (first == last) {

	1064 masm->CheckCharacter(first, in_range);

	1065 } else {

	1066 masm->CheckCharacterInRange(first, last, in_range);

	1067 }

	1068 if (out_of_range != fall_through) masm->GoTo(out_of_range);

	1069 }

	1070 }

	1071

	1072

	1073 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.

	1074 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.

	1075 static void EmitUseLookupTable(

	1076 RegExpMacroAssembler* masm,

	1077 ZoneGrowableArray<int>* ranges,

	1078 intptr_t start_index,

	1079 intptr_t end_index,

	1080 intptr_t min_char,

	1081 BlockLabel* fall_through,

	1082 BlockLabel* even_label,

	1083 BlockLabel* odd_label) {

	1084 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	1085 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;

	1086

	1087 intptr_t base = (min_char & ~kMask);

	1088

	1089 // Assert that everything is on one kTableSize page.

	1090 for (intptr_t i = start_index; i <= end_index; i++) {

	1091 ASSERT((ranges->At(i) & ~kMask) == base);

	1092 }

	1093 ASSERT(start_index == 0 \|\| (ranges->At(start_index - 1) & ~kMask) <= base);

	1094

	1095 char templ[kSize];

	1096 BlockLabel* on_bit_set;

	1097 BlockLabel* on_bit_clear;

	1098 intptr_t bit;

	1099 if (even_label == fall_through) {

	1100 on_bit_set = odd_label;

	1101 on_bit_clear = even_label;

	1102 bit = 1;

	1103 } else {

	1104 on_bit_set = even_label;

	1105 on_bit_clear = odd_label;

	1106 bit = 0;

	1107 }

	1108 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;

	1109 i++) {

	1110 templ[i] = bit;

	1111 }

	1112 intptr_t j = 0;

	1113 bit ^= 1;

	1114 for (intptr_t i = start_index; i < end_index; i++) {

	1115 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {

	1116 templ[j] = bit;

	1117 }

	1118 bit ^= 1;

	1119 }

	1120 for (intptr_t i = j; i < kSize; i++) {

	1121 templ[i] = bit;

	1122 }

	1123 // TODO(erikcorry): Cache these.

	1124 const TypedData& ba = TypedData::ZoneHandle(

	1125 masm->isolate(),

	1126 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));

	1127 for (intptr_t i = 0; i < kSize; i++) {

	1128 ba.SetUint8(i, templ[i]);

	1129 }

	1130 masm->CheckBitInTable(ba, on_bit_set);

	1131 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);

	1132 }

	1133

	1134

	1135 static void CutOutRange(RegExpMacroAssembler* masm,

	1136 ZoneGrowableArray<int>* ranges,

	1137 intptr_t start_index,

	1138 intptr_t end_index,

	1139 intptr_t cut_index,

	1140 BlockLabel* even_label,

	1141 BlockLabel* odd_label) {

	1142 bool odd = (((cut_index - start_index) & 1) == 1);

	1143 BlockLabel* in_range_label = odd ? odd_label : even_label;

	1144 BlockLabel dummy;

	1145 EmitDoubleBoundaryTest(masm,

	1146 ranges->At(cut_index),

	1147 ranges->At(cut_index + 1) - 1,

	1148 &dummy,

	1149 in_range_label,

	1150 &dummy);

	1151 ASSERT(!dummy.IsLinked());

	1152 // Cut out the single range by rewriting the array. This creates a new

	1153 // range that is a merger of the two ranges on either side of the one we

	1154 // are cutting out. The oddity of the labels is preserved.

	1155 for (intptr_t j = cut_index; j > start_index; j--) {

	1156 (*ranges)[j] = ranges->At(j - 1);

	1157 }

	1158 for (intptr_t j = cut_index + 1; j < end_index; j++) {

	1159 (*ranges)[j] = ranges->At(j + 1);

	1160 }

	1161 }

	1162

	1163

	1164 // Unicode case. Split the search space into kSize spaces that are handled

	1165 // with recursion.

	1166 static void SplitSearchSpace(ZoneGrowableArray<int>* ranges,

	1167 intptr_t start_index,

	1168 intptr_t end_index,

	1169 intptr_t* new_start_index,

	1170 intptr_t* new_end_index,

	1171 intptr_t* border) {

	1172 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	1173 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;

	1174

	1175 intptr_t first = ranges->At(start_index);

	1176 intptr_t last = ranges->At(end_index) - 1;

	1177

	1178 *new_start_index = start_index;

	1179 *border = (ranges->At(start_index) & ~kMask) + kSize;

	1180 while (*new_start_index < end_index) {

	1181 if (ranges->At(new_start_index) > border) break;

	1182 (*new_start_index)++;

	1183 }

	1184 // new_start_index is the index of the first edge that is beyond the

	1185 // current kSize space.

	1186

	1187 // For very large search spaces we do a binary chop search of the non-Latin1

	1188 // space instead of just going to the end of the current kSize space. The

	1189 // heuristics are complicated a little by the fact that any 128-character

	1190 // encoding space can be quickly tested with a table lookup, so we don't

	1191 // wish to do binary chop search at a smaller granularity than that. A

	1192 // 128-character space can take up a lot of space in the ranges array if,

	1193 // for example, we only want to match every second character (eg. the lower

	1194 // case characters on some Unicode pages).

	1195 intptr_t binary_chop_index = (end_index + start_index) / 2;

	1196 // The first test ensures that we get to the code that handles the Latin1

	1197 // range with a single not-taken branch, speeding up this important

	1198 // character range (even non-Latin1 charset-based text has spaces and

	1199 // punctuation).

	1200 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.

	1201 end_index - start_index > (new_start_index - start_index) 2 &&

	1202 last - first > kSize * 2 &&

	1203 binary_chop_index > *new_start_index &&

	1204 ranges->At(binary_chop_index) >= first + 2 * kSize) {

	1205 intptr_t scan_forward_for_section_border = binary_chop_index;;

	1206 intptr_t new_border = (ranges->At(binary_chop_index) \| kMask) + 1;

	1207

	1208 while (scan_forward_for_section_border < end_index) {

	1209 if (ranges->At(scan_forward_for_section_border) > new_border) {

	1210 *new_start_index = scan_forward_for_section_border;

	1211 *border = new_border;

	1212 break;

	1213 }

	1214 scan_forward_for_section_border++;

	1215 }

	1216 }

	1217

	1218 ASSERT(*new_start_index > start_index);

	1219 new_end_index = new_start_index - 1;

	1220 if (ranges->At(new_end_index) == border) {

	1221 (*new_end_index)--;

	1222 }

	1223 if (*border >= ranges->At(end_index)) {

	1224 *border = ranges->At(end_index);

	1225 *new_start_index = end_index; // Won't be used.

	1226 *new_end_index = end_index - 1;

	1227 }

	1228 }

	1229

	1230

	1231 // Gets a series of segment boundaries representing a character class. If the

	1232 // character is in the range between an even and an odd boundary (counting from

	1233 // start_index) then go to even_label, otherwise go to odd_label. We already

	1234 // know that the character is in the range of min_char to max_char inclusive.

	1235 // Either label can be NULL indicating backtracking. Either label can also be

	1236 // equal to the fall_through label.

	1237 static void GenerateBranches(RegExpMacroAssembler* masm,

	1238 ZoneGrowableArray<int>* ranges,

	1239 intptr_t start_index,

	1240 intptr_t end_index,

	1241 uint16_t min_char,

	1242 uint16_t max_char,

	1243 BlockLabel* fall_through,

	1244 BlockLabel* even_label,

	1245 BlockLabel* odd_label) {

	1246 intptr_t first = ranges->At(start_index);

	1247 intptr_t last = ranges->At(end_index) - 1;

	1248

	1249 ASSERT(min_char < first);

	1250

	1251 // Just need to test if the character is before or on-or-after

	1252 // a particular character.

	1253 if (start_index == end_index) {

	1254 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);

	1255 return;

	1256 }

	1257

	1258 // Another almost trivial case: There is one interval in the middle that is

	1259 // different from the end intervals.

	1260 if (start_index + 1 == end_index) {

	1261 EmitDoubleBoundaryTest(

	1262 masm, first, last, fall_through, even_label, odd_label);

	1263 return;

	1264 }

	1265

	1266 // It's not worth using table lookup if there are very few intervals in the

	1267 // character class.

	1268 if (end_index - start_index <= 6) {

	1269 // It is faster to test for individual characters, so we look for those

	1270 // first, then try arbitrary ranges in the second round.

	1271 static intptr_t kNoCutIndex = -1;

	1272 intptr_t cut = kNoCutIndex;

	1273 for (intptr_t i = start_index; i < end_index; i++) {

	1274 if (ranges->At(i) == ranges->At(i + 1) - 1) {

	1275 cut = i;

	1276 break;

	1277 }

	1278 }

	1279 if (cut == kNoCutIndex) cut = start_index;

	1280 CutOutRange(

	1281 masm, ranges, start_index, end_index, cut, even_label, odd_label);

	1282 ASSERT(end_index - start_index >= 2);

	1283 GenerateBranches(masm,

	1284 ranges,

	1285 start_index + 1,

	1286 end_index - 1,

	1287 min_char,

	1288 max_char,

	1289 fall_through,

	1290 even_label,

	1291 odd_label);

	1292 return;

	1293 }

	1294

	1295 // If there are a lot of intervals in the regexp, then we will use tables to

	1296 // determine whether the character is inside or outside the character class.

	1297 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;

	1298

	1299 if ((max_char >> kBits) == (min_char >> kBits)) {

	1300 EmitUseLookupTable(masm,

	1301 ranges,

	1302 start_index,

	1303 end_index,

	1304 min_char,

	1305 fall_through,

	1306 even_label,

	1307 odd_label);

	1308 return;

	1309 }

	1310

	1311 if ((min_char >> kBits) != (first >> kBits)) {

	1312 masm->CheckCharacterLT(first, odd_label);

	1313 GenerateBranches(masm,

	1314 ranges,

	1315 start_index + 1,

	1316 end_index,

	1317 first,

	1318 max_char,

	1319 fall_through,

	1320 odd_label,

	1321 even_label);

	1322 return;

	1323 }

	1324

	1325 intptr_t new_start_index = 0;

	1326 intptr_t new_end_index = 0;

	1327 intptr_t border = 0;

	1328

	1329 SplitSearchSpace(ranges,

	1330 start_index,

	1331 end_index,

	1332 &new_start_index,

	1333 &new_end_index,

	1334 &border);

	1335

	1336 BlockLabel handle_rest;

	1337 BlockLabel* above = &handle_rest;

	1338 if (border == last + 1) {

	1339 // We didn't find any section that started after the limit, so everything

	1340 // above the border is one of the terminal labels.

	1341 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;

	1342 ASSERT(new_end_index == end_index - 1);

	1343 }

	1344

	1345 ASSERT(start_index <= new_end_index);

	1346 ASSERT(new_start_index <= end_index);

	1347 ASSERT(start_index < new_start_index);

	1348 ASSERT(new_end_index < end_index);

	1349 ASSERT(new_end_index + 1 == new_start_index \|\|

	1350 (new_end_index + 2 == new_start_index &&

	1351 border == ranges->At(new_end_index + 1)));

	1352 ASSERT(min_char < border - 1);

	1353 ASSERT(border < max_char);

	1354 ASSERT(ranges->At(new_end_index) < border);

	1355 ASSERT(border < ranges->At(new_start_index) \|\|

	1356 (border == ranges->At(new_start_index) &&

	1357 new_start_index == end_index &&

	1358 new_end_index == end_index - 1 &&

	1359 border == last + 1));

	1360 ASSERT(new_start_index == 0 \|\| border >= ranges->At(new_start_index - 1));

	1361

	1362 masm->CheckCharacterGT(border - 1, above);

	1363 BlockLabel dummy;

	1364 GenerateBranches(masm,

	1365 ranges,

	1366 start_index,

	1367 new_end_index,

	1368 min_char,

	1369 border - 1,

	1370 &dummy,

	1371 even_label,

	1372 odd_label);

	1373

	1374 if (handle_rest.IsLinked()) {

	1375 masm->BindBlock(&handle_rest);

	1376 bool flip = (new_start_index & 1) != (start_index & 1);

	1377 GenerateBranches(masm,

	1378 ranges,

	1379 new_start_index,

	1380 end_index,

	1381 border,

	1382 max_char,

	1383 &dummy,

	1384 flip ? odd_label : even_label,

	1385 flip ? even_label : odd_label);

	1386 }

	1387 }

	1388

	1389

	1390 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

	1391 RegExpCharacterClass* cc,

	1392 bool one_byte,

	1393 BlockLabel* on_failure,

	1394 intptr_t cp_offset,

	1395 bool check_offset,

	1396 bool preloaded,

	1397 Isolate* isolate) {

	1398 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();

	1399 if (!CharacterRange::IsCanonical(ranges)) {

	1400 CharacterRange::Canonicalize(ranges);

	1401 }

	1402

	1403 intptr_t max_char;

	1404 if (one_byte) {

	1405 max_char = Symbols::kMaxOneCharCodeSymbol;

	1406 } else {

	1407 max_char = Utf16::kMaxCodeUnit;

	1408 }

	1409

	1410 intptr_t range_count = ranges->length();

	1411

	1412 intptr_t last_valid_range = range_count - 1;

	1413 while (last_valid_range >= 0) {

	1414 CharacterRange& range = (*ranges)[last_valid_range];

	1415 if (range.from() <= max_char) {

	1416 break;

	1417 }

	1418 last_valid_range--;

	1419 }

	1420

	1421 if (last_valid_range < 0) {

	1422 if (!cc->is_negated()) {

	1423 macro_assembler->GoTo(on_failure);

	1424 }

	1425 if (check_offset) {

	1426 macro_assembler->CheckPosition(cp_offset, on_failure);

	1427 }

	1428 return;

	1429 }

	1430

	1431 if (last_valid_range == 0 &&

	1432 ranges->At(0).IsEverything(max_char)) {

	1433 if (cc->is_negated()) {

	1434 macro_assembler->GoTo(on_failure);

	1435 } else {

	1436 // This is a common case hit by non-anchored expressions.

	1437 if (check_offset) {

	1438 macro_assembler->CheckPosition(cp_offset, on_failure);

	1439 }

	1440 }

	1441 return;

	1442 }

	1443 if (last_valid_range == 0 &&

	1444 !cc->is_negated() &&

	1445 ranges->At(0).IsEverything(max_char)) {

	1446 // This is a common case hit by non-anchored expressions.

	1447 if (check_offset) {

	1448 macro_assembler->CheckPosition(cp_offset, on_failure);

	1449 }

	1450 return;

	1451 }

	1452

	1453 if (!preloaded) {

	1454 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);

	1455 }

	1456

	1457 if (cc->is_standard() &&

	1458 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),

	1459 on_failure)) {

	1460 return;

	1461 }

	1462

	1463

	1464 // A new list with ascending entries. Each entry is a code unit

	1465 // where there is a boundary between code units that are part of

	1466 // the class and code units that are not. Normally we insert an

	1467 // entry at zero which goes to the failure label, but if there

	1468 // was already one there we fall through for success on that entry.

	1469 // Subsequent entries have alternating meaning (success/failure).

	1470 ZoneGrowableArray<int>* range_boundaries =

	1471 new(isolate) ZoneGrowableArray<int>(last_valid_range);

	1472

	1473 bool zeroth_entry_is_failure = !cc->is_negated();

	1474

	1475 for (intptr_t i = 0; i <= last_valid_range; i++) {

	1476 CharacterRange& range = (*ranges)[i];

	1477 if (range.from() == 0) {

	1478 ASSERT(i == 0);

	1479 zeroth_entry_is_failure = !zeroth_entry_is_failure;

	1480 } else {

	1481 range_boundaries->Add(range.from());

	1482 }

	1483 range_boundaries->Add(range.to() + 1);

	1484 }

	1485 intptr_t end_index = range_boundaries->length() - 1;

	1486 if (range_boundaries->At(end_index) > max_char) {

	1487 end_index--;

	1488 }

	1489

	1490 BlockLabel fall_through;

	1491 GenerateBranches(macro_assembler,

	1492 range_boundaries,

	1493 0, // start_index.

	1494 end_index,

	1495 0, // min_char.

	1496 max_char,

	1497 &fall_through,

	1498 zeroth_entry_is_failure ? &fall_through : on_failure,

	1499 zeroth_entry_is_failure ? on_failure : &fall_through);

	1500 macro_assembler->BindBlock(&fall_through);

	1501 }

	1502

	1503

	1504 RegExpNode::~RegExpNode() {

	1505 }

	1506

	1507

	1508 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,

	1509 Trace* trace) {

	1510 // If we are generating a greedy loop then don't stop and don't reuse code.

	1511 if (trace->stop_node() != NULL) {

	1512 return CONTINUE;

	1513 }

	1514

	1515 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1516 if (trace->is_trivial()) {

	1517 if (label_.IsBound()) {

	1518 // We are being asked to generate a generic version, but that's already

	1519 // been done so just go to it.

	1520 macro_assembler->GoTo(&label_);

	1521 return DONE;

	1522 }

	1523 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {

	1524 // To avoid too deep recursion we push the node to the work queue and just

	1525 // generate a goto here.

	1526 compiler->AddWork(this);

	1527 macro_assembler->GoTo(&label_);

	1528 return DONE;

	1529 }

	1530 // Generate generic version of the node and bind the label for later use.

	1531 macro_assembler->BindBlock(&label_);

	1532 return CONTINUE;

	1533 }

	1534

	1535 // We are being asked to make a non-generic version. Keep track of how many

	1536 // non-generic versions we generate so as not to overdo it.

	1537 trace_count_++;

	1538 if (kRegexpOptimization &&

	1539 trace_count_ < kMaxCopiesCodeGenerated &&

	1540 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {

	1541 return CONTINUE;

	1542 }

	1543

	1544 // If we get here code has been generated for this node too many times or

	1545 // recursion is too deep. Time to switch to a generic version. The code for

	1546 // generic versions above can handle deep recursion properly.

	1547 trace->Flush(compiler, this);

	1548 return DONE;

	1549 }

	1550

	1551

	1552 intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,

	1553 intptr_t budget,

	1554 bool not_at_start) {

	1555 if (budget <= 0) return 0;

	1556 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!

	1557 return on_success()->EatsAtLeast(still_to_find,

	1558 budget - 1,

	1559 not_at_start);

	1560 }

	1561

	1562

	1563 void ActionNode::FillInBMInfo(intptr_t offset,

	1564 intptr_t budget,

	1565 BoyerMooreLookahead* bm,

	1566 bool not_at_start) {

	1567 if (action_type_ == BEGIN_SUBMATCH) {

	1568 bm->SetRest(offset);

	1569 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {

	1570 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);

	1571 }

	1572 SaveBMInfo(bm, not_at_start, offset);

	1573 }

	1574

	1575

	1576 intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,

	1577 intptr_t budget,

	1578 bool not_at_start) {

	1579 if (budget <= 0) return 0;

	1580 // If we know we are not at the start and we are asked "how many characters

	1581 // will you match if you succeed?" then we can answer anything since false

	1582 // implies false. So lets just return the max answer (still_to_find) since

	1583 // that won't prevent us from preloading a lot of characters for the other

	1584 // branches in the node graph.

	1585 if (assertion_type() == AT_START && not_at_start) return still_to_find;

	1586 return on_success()->EatsAtLeast(still_to_find,

	1587 budget - 1,

	1588 not_at_start);

	1589 }

	1590

	1591

	1592 void AssertionNode::FillInBMInfo(intptr_t offset,

	1593 intptr_t budget,

	1594 BoyerMooreLookahead* bm,

	1595 bool not_at_start) {

	1596 // Match the behaviour of EatsAtLeast on this node.

	1597 if (assertion_type() == AT_START && not_at_start) return;

	1598 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);

	1599 SaveBMInfo(bm, not_at_start, offset);

	1600 }

	1601

	1602

	1603 intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,

	1604 intptr_t budget,

	1605 bool not_at_start) {

	1606 if (budget <= 0) return 0;

	1607 return on_success()->EatsAtLeast(still_to_find,

	1608 budget - 1,

	1609 not_at_start);

	1610 }

	1611

	1612

	1613 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,

	1614 intptr_t budget,

	1615 bool not_at_start) {

	1616 intptr_t answer = Length();

	1617 if (answer >= still_to_find) return answer;

	1618 if (budget <= 0) return answer;

	1619 // We are not at start after this node so we set the last argument to 'true'.

	1620 return answer + on_success()->EatsAtLeast(still_to_find - answer,

	1621 budget - 1,

	1622 true);

	1623 }

	1624

	1625

	1626 intptr_t NegativeLookaheadChoiceNode::EatsAtLeast(intptr_t still_to_find,

	1627 intptr_t budget,

	1628 bool not_at_start) {

	1629 if (budget <= 0) return 0;

	1630 // Alternative 0 is the negative lookahead, alternative 1 is what comes

	1631 // afterwards.

	1632 RegExpNode* node = (*alternatives_)[1].node();

	1633 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);

	1634 }

	1635

	1636

	1637 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(

	1638 QuickCheckDetails* details,

	1639 RegExpCompiler* compiler,

	1640 intptr_t filled_in,

	1641 bool not_at_start) {

	1642 // Alternative 0 is the negative lookahead, alternative 1 is what comes

	1643 // afterwards.

	1644 RegExpNode* node = (*alternatives_)[1].node();

	1645 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);

	1646 }

	1647

	1648

	1649 intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,

	1650 intptr_t budget,

	1651 RegExpNode* ignore_this_node,

	1652 bool not_at_start) {

	1653 if (budget <= 0) return 0;

	1654 intptr_t min = 100;

	1655 intptr_t choice_count = alternatives_->length();

	1656 budget = (budget - 1) / choice_count;

	1657 for (intptr_t i = 0; i < choice_count; i++) {

	1658 RegExpNode* node = (*alternatives_)[i].node();

	1659 if (node == ignore_this_node) continue;

	1660 intptr_t node_eats_at_least =

	1661 node->EatsAtLeast(still_to_find, budget, not_at_start);

	1662 if (node_eats_at_least < min) min = node_eats_at_least;

	1663 if (min == 0) return 0;

	1664 }

	1665 return min;

	1666 }

	1667

	1668

	1669 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,

	1670 intptr_t budget,

	1671 bool not_at_start) {

	1672 return EatsAtLeastHelper(still_to_find,

	1673 budget - 1,

	1674 loop_node_,

	1675 not_at_start);

	1676 }

	1677

	1678

	1679 intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,

	1680 intptr_t budget,

	1681 bool not_at_start) {

	1682 return EatsAtLeastHelper(still_to_find,

	1683 budget,

	1684 NULL,

	1685 not_at_start);

	1686 }

	1687

	1688

	1689 // Takes the left-most 1-bit and smears it out, setting all bits to its right.

	1690 static inline uint32_t SmearBitsRight(uint32_t v) {

	1691 v \|= v >> 1;

	1692 v \|= v >> 2;

	1693 v \|= v >> 4;

	1694 v \|= v >> 8;

	1695 v \|= v >> 16;

	1696 return v;

	1697 }

	1698

	1699

	1700 bool QuickCheckDetails::Rationalize(bool asc) {

	1701 bool found_useful_op = false;

	1702 uint32_t char_mask;

	1703 if (asc) {

	1704 char_mask = Symbols::kMaxOneCharCodeSymbol;

	1705 } else {

	1706 char_mask = Utf16::kMaxCodeUnit;

	1707 }

	1708 mask_ = 0;

	1709 value_ = 0;

	1710 intptr_t char_shift = 0;

	1711 for (intptr_t i = 0; i < characters_; i++) {

	1712 Position* pos = &positions_[i];

	1713 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {

	1714 found_useful_op = true;

	1715 }

	1716 mask_ \|= (pos->mask & char_mask) << char_shift;

	1717 value_ \|= (pos->value & char_mask) << char_shift;

	1718 char_shift += asc ? 8 : 16;

	1719 }

	1720 return found_useful_op;

	1721 }

	1722

	1723

	1724 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,

	1725 Trace* bounds_check_trace,

	1726 Trace* trace,

	1727 bool preload_has_checked_bounds,

	1728 BlockLabel* on_possible_success,

	1729 QuickCheckDetails* details,

	1730 bool fall_through_on_failure) {

	1731 if (details->characters() == 0) return false;

	1732 GetQuickCheckDetails(

	1733 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);

	1734 if (details->cannot_match()) return false;

	1735 if (!details->Rationalize(compiler->one_byte())) return false;

	1736 ASSERT(details->characters() == 1 \|\|

	1737 compiler->macro_assembler()->CanReadUnaligned());

	1738 uint32_t mask = details->mask();

	1739 uint32_t value = details->value();

	1740

	1741 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1742

	1743 if (trace->characters_preloaded() != details->characters()) {

	1744 ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());

	1745 // We are attempting to preload the minimum number of characters

	1746 // any choice would eat, so if the bounds check fails, then none of the

	1747 // choices can succeed, so we can just immediately backtrack, rather

	1748 // than go to the next choice.

	1749 assembler->LoadCurrentCharacter(trace->cp_offset(),

	1750 bounds_check_trace->backtrack(),

	1751 !preload_has_checked_bounds,

	1752 details->characters());

	1753 }

	1754

	1755

	1756 bool need_mask = true;

	1757

	1758 if (details->characters() == 1) {

	1759 // If number of characters preloaded is 1 then we used a byte or 16 bit

	1760 // load so the value is already masked down.

	1761 uint32_t char_mask;

	1762 if (compiler->one_byte()) {

	1763 char_mask = Symbols::kMaxOneCharCodeSymbol;

	1764 } else {

	1765 char_mask = Utf16::kMaxCodeUnit;

	1766 }

	1767 if ((mask & char_mask) == char_mask) need_mask = false;

	1768 mask &= char_mask;

	1769 } else {

	1770 // For 2-character preloads in one-byte mode or 1-character preloads in

	1771 // two-byte mode we also use a 16 bit load with zero extend.

	1772 if (details->characters() == 2 && compiler->one_byte()) {

	1773 if ((mask & 0xffff) == 0xffff) need_mask = false;

	1774 } else if (details->characters() == 1 && !compiler->one_byte()) {

	1775 if ((mask & 0xffff) == 0xffff) need_mask = false;

	1776 } else {

	1777 if (mask == 0xffffffff) need_mask = false;

	1778 }

	1779 }

	1780

	1781 if (fall_through_on_failure) {

	1782 if (need_mask) {

	1783 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);

	1784 } else {

	1785 assembler->CheckCharacter(value, on_possible_success);

	1786 }

	1787 } else {

	1788 if (need_mask) {

	1789 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());

	1790 } else {

	1791 assembler->CheckNotCharacter(value, trace->backtrack());

	1792 }

	1793 }

	1794 return true;

	1795 }

	1796

	1797

	1798 // Here is the meat of GetQuickCheckDetails (see also the comment on the

	1799 // super-class in the .h file).

	1800 //

	1801 // We iterate along the text object, building up for each character a

	1802 // mask and value that can be used to test for a quick failure to match.

	1803 // The masks and values for the positions will be combined into a single

	1804 // machine word for the current character width in order to be used in

	1805 // generating a quick check.

	1806 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

	1807 RegExpCompiler* compiler,

	1808 intptr_t characters_filled_in,

	1809 bool not_at_start) {

	1810 ASSERT(characters_filled_in < details->characters());

	1811 intptr_t characters = details->characters();

	1812 intptr_t char_mask;

	1813 if (compiler->one_byte()) {

	1814 char_mask = Symbols::kMaxOneCharCodeSymbol;

	1815 } else {

	1816 char_mask = Utf16::kMaxCodeUnit;

	1817 }

	1818 for (intptr_t k = 0; k < elms_->length(); k++) {

	1819 TextElement elm = elms_->At(k);

	1820 if (elm.text_type() == TextElement::ATOM) {

	1821 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();

	1822 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {

	1823 QuickCheckDetails::Position* pos =

	1824 details->positions(characters_filled_in);

	1825 uint16_t c = quarks->At(i);

	1826 if (c > char_mask) {

	1827 // If we expect a non-Latin1 character from an one-byte string,

	1828 // there is no way we can match. Not even case independent

	1829 // matching can turn an Latin1 character into non-Latin1 or

	1830 // vice versa.

	1831 // TODO(dcarney): issue 3550. Verify that this works as expected.

	1832 // For example, \u0178 is uppercase of \u00ff (y-umlaut).

	1833 details->set_cannot_match();

	1834 pos->determines_perfectly = false;

	1835 return;

	1836 }

	1837 if (compiler->ignore_case()) {

	1838 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	1839 intptr_t length =

	1840 GetCaseIndependentLetters(c, compiler->one_byte(), chars);

	1841 ASSERT(length != 0); // Can only happen if c > char_mask (see above).

	1842 if (length == 1) {

	1843 // This letter has no case equivalents, so it's nice and simple

	1844 // and the mask-compare will determine definitely whether we have

	1845 // a match at this character position.

	1846 pos->mask = char_mask;

	1847 pos->value = c;

	1848 pos->determines_perfectly = true;

	1849 } else {

	1850 uint32_t common_bits = char_mask;

	1851 uint32_t bits = chars[0];

	1852 for (intptr_t j = 1; j < length; j++) {

	1853 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);

	1854 common_bits ^= differing_bits;

	1855 bits &= common_bits;

	1856 }

	1857 // If length is 2 and common bits has only one zero in it then

	1858 // our mask and compare instruction will determine definitely

	1859 // whether we have a match at this character position. Otherwise

	1860 // it can only be an approximate check.

	1861 uint32_t one_zero = (common_bits \| ~char_mask);

	1862 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {

	1863 pos->determines_perfectly = true;

	1864 }

	1865 pos->mask = common_bits;

	1866 pos->value = bits;

	1867 }

	1868 } else {

	1869 // Don't ignore case. Nice simple case where the mask-compare will

	1870 // determine definitely whether we have a match at this character

	1871 // position.

	1872 pos->mask = char_mask;

	1873 pos->value = c;

	1874 pos->determines_perfectly = true;

	1875 }

	1876 characters_filled_in++;

	1877 ASSERT(characters_filled_in <= details->characters());

	1878 if (characters_filled_in == details->characters()) {

	1879 return;

	1880 }

	1881 }

	1882 } else {

	1883 QuickCheckDetails::Position* pos =

	1884 details->positions(characters_filled_in);

	1885 RegExpCharacterClass* tree = elm.char_class();

	1886 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();

	1887 if (tree->is_negated()) {

	1888 // A quick check uses multi-character mask and compare. There is no

	1889 // useful way to incorporate a negative char class into this scheme

	1890 // so we just conservatively create a mask and value that will always

	1891 // succeed.

	1892 pos->mask = 0;

	1893 pos->value = 0;

	1894 } else {

	1895 intptr_t first_range = 0;

	1896 while (ranges->At(first_range).from() > char_mask) {

	1897 first_range++;

	1898 if (first_range == ranges->length()) {

	1899 details->set_cannot_match();

	1900 pos->determines_perfectly = false;

	1901 return;

	1902 }

	1903 }

	1904 CharacterRange range = ranges->At(first_range);

	1905 uint16_t from = range.from();

	1906 uint16_t to = range.to();

	1907 if (to > char_mask) {

	1908 to = char_mask;

	1909 }

	1910 uint32_t differing_bits = (from ^ to);

	1911 // A mask and compare is only perfect if the differing bits form a

	1912 // number like 00011111 with one single block of trailing 1s.

	1913 if ((differing_bits & (differing_bits + 1)) == 0 &&

	1914 from + differing_bits == to) {

	1915 pos->determines_perfectly = true;

	1916 }

	1917 uint32_t common_bits = ~SmearBitsRight(differing_bits);

	1918 uint32_t bits = (from & common_bits);

	1919 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {

	1920 CharacterRange range = ranges->At(i);

	1921 uint16_t from = range.from();

	1922 uint16_t to = range.to();

	1923 if (from > char_mask) continue;

	1924 if (to > char_mask) to = char_mask;

	1925 // Here we are combining more ranges into the mask and compare

	1926 // value. With each new range the mask becomes more sparse and

	1927 // so the chances of a false positive rise. A character class

	1928 // with multiple ranges is assumed never to be equivalent to a

	1929 // mask and compare operation.

	1930 pos->determines_perfectly = false;

	1931 uint32_t new_common_bits = (from ^ to);

	1932 new_common_bits = ~SmearBitsRight(new_common_bits);

	1933 common_bits &= new_common_bits;

	1934 bits &= new_common_bits;

	1935 uint32_t differing_bits = (from & common_bits) ^ bits;

	1936 common_bits ^= differing_bits;

	1937 bits &= common_bits;

	1938 }

	1939 pos->mask = common_bits;

	1940 pos->value = bits;

	1941 }

	1942 characters_filled_in++;

	1943 ASSERT(characters_filled_in <= details->characters());

	1944 if (characters_filled_in == details->characters()) {

	1945 return;

	1946 }

	1947 }

	1948 }

	1949 ASSERT(characters_filled_in != details->characters());

	1950 if (!details->cannot_match()) {

	1951 on_success()-> GetQuickCheckDetails(details,

	1952 compiler,

	1953 characters_filled_in,

	1954 true);

	1955 }

	1956 }

	1957

	1958

	1959 void QuickCheckDetails::Clear() {

	1960 for (int i = 0; i < characters_; i++) {

	1961 positions_[i].mask = 0;

	1962 positions_[i].value = 0;

	1963 positions_[i].determines_perfectly = false;

	1964 }

	1965 characters_ = 0;

	1966 }

	1967

	1968

	1969 void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {

	1970 ASSERT(by >= 0);

	1971 if (by >= characters_) {

	1972 Clear();

	1973 return;

	1974 }

	1975 for (intptr_t i = 0; i < characters_ - by; i++) {

	1976 positions_[i] = positions_[by + i];

	1977 }

	1978 for (intptr_t i = characters_ - by; i < characters_; i++) {

	1979 positions_[i].mask = 0;

	1980 positions_[i].value = 0;

	1981 positions_[i].determines_perfectly = false;

	1982 }

	1983 characters_ -= by;

	1984 // We could change mask_ and value_ here but we would never advance unless

	1985 // they had already been used in a check and they won't be used again because

	1986 // it would gain us nothing. So there's no point.

	1987 }

	1988

	1989

	1990 void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {

	1991 ASSERT(characters_ == other->characters_);

	1992 if (other->cannot_match_) {

	1993 return;

	1994 }

	1995 if (cannot_match_) {

	1996 this = other;

	1997 return;

	1998 }

	1999 for (intptr_t i = from_index; i < characters_; i++) {

	2000 QuickCheckDetails::Position* pos = positions(i);

	2001 QuickCheckDetails::Position* other_pos = other->positions(i);

	2002 if (pos->mask != other_pos->mask \|\|

	2003 pos->value != other_pos->value \|\|

	2004 !other_pos->determines_perfectly) {

	2005 // Our mask-compare operation will be approximate unless we have the

	2006 // exact same operation on both sides of the alternation.

	2007 pos->determines_perfectly = false;

	2008 }

	2009 pos->mask &= other_pos->mask;

	2010 pos->value &= pos->mask;

	2011 other_pos->value &= pos->mask;

	2012 uint16_t differing_bits = (pos->value ^ other_pos->value);

	2013 pos->mask &= ~differing_bits;

	2014 pos->value &= pos->mask;

	2015 }

	2016 }

	2017

	2018

	2019 class VisitMarker : public ValueObject {

	2020 public:

	2021 explicit VisitMarker(NodeInfo* info) : info_(info) {

	2022 ASSERT(!info->visited);

	2023 info->visited = true;

	2024 }

	2025 ~VisitMarker() {

	2026 info_->visited = false;

	2027 }

	2028 private:

	2029 NodeInfo* info_;

	2030 };

	2031

	2032

	2033 RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth, bool ignore_case) {

	2034 if (info()->replacement_calculated) return replacement();

	2035 if (depth < 0) return this;

	2036 ASSERT(!info()->visited);

	2037 VisitMarker marker(info());

	2038 return FilterSuccessor(depth - 1, ignore_case);

	2039 }

	2040

	2041

	2042 RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth, bool ignore_case) {

	2043 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);

	2044 if (next == NULL) return set_replacement(NULL);

	2045 on_success_ = next;

	2046 return set_replacement(this);

	2047 }

	2048

	2049

	2050 // We need to check for the following characters: 0x39c 0x3bc 0x178.

	2051 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {

	2052 // TODO(dcarney): this could be a lot more efficient.

	2053 return range.Contains(0x39c) \|\|

	2054 range.Contains(0x3bc) \|\| range.Contains(0x178);

	2055 }

	2056

	2057

	2058 static bool RangesContainLatin1Equivalents(

	2059 ZoneGrowableArray<CharacterRange>* ranges) {

	2060 for (intptr_t i = 0; i < ranges->length(); i++) {

	2061 // TODO(dcarney): this could be a lot more efficient.

	2062 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;

	2063 }

	2064 return false;

	2065 }

	2066

	2067

	2068 static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {

	2069 ASSERT(c > Symbols::kMaxOneCharCodeSymbol);

	2070 switch (c) {

	2071 // This are equivalent characters in unicode.

	2072 case 0x39c:

	2073 case 0x3bc:

	2074 return 0xb5;

	2075 // This is an uppercase of a Latin-1 character

	2076 // outside of Latin-1.

	2077 case 0x178:

	2078 return 0xff;

	2079 }

	2080 return 0;

	2081 }

	2082

	2083

	2084 RegExpNode* TextNode::FilterOneByte(intptr_t depth, bool ignore_case) {

	2085 if (info()->replacement_calculated) return replacement();

	2086 if (depth < 0) return this;

	2087 ASSERT(!info()->visited);

	2088 VisitMarker marker(info());

	2089 intptr_t element_count = elms_->length();

	2090 for (intptr_t i = 0; i < element_count; i++) {

	2091 TextElement elm = elms_->At(i);

	2092 if (elm.text_type() == TextElement::ATOM) {

	2093 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();

	2094 for (intptr_t j = 0; j < quarks->length(); j++) {

	2095 uint16_t c = quarks->At(j);

	2096 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;

	2097 if (!ignore_case) return set_replacement(NULL);

	2098 // Here, we need to check for characters whose upper and lower cases

	2099 // are outside the Latin-1 range.

	2100 uint16_t converted = ConvertNonLatin1ToLatin1(c);

	2101 // Character is outside Latin-1 completely

	2102 if (converted == 0) return set_replacement(NULL);

	2103 // Convert quark to Latin-1 in place.

	2104 (*quarks)[0] = converted;

	2105 }

	2106 } else {

	2107 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);

	2108 RegExpCharacterClass* cc = elm.char_class();

	2109 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();

	2110 if (!CharacterRange::IsCanonical(ranges)) {

	2111 CharacterRange::Canonicalize(ranges);

	2112 }

	2113 // Now they are in order so we only need to look at the first.

	2114 intptr_t range_count = ranges->length();

	2115 if (cc->is_negated()) {

	2116 if (range_count != 0 &&

	2117 ranges->At(0).from() == 0 &&

	2118 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) {

	2119 // This will be handled in a later filter.

	2120 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

	2121 return set_replacement(NULL);

	2122 }

	2123 } else {

	2124 if (range_count == 0 \|\|

	2125 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) {

	2126 // This will be handled in a later filter.

	2127 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

	2128 return set_replacement(NULL);

	2129 }

	2130 }

	2131 }

	2132 }

	2133 return FilterSuccessor(depth - 1, ignore_case);

	2134 }

	2135

	2136

	2137 RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) {

	2138 if (info()->replacement_calculated) return replacement();

	2139 if (depth < 0) return this;

	2140 if (info()->visited) return this;

	2141 {

	2142 VisitMarker marker(info());

	2143

	2144 RegExpNode* continue_replacement =

	2145 continue_node_->FilterOneByte(depth - 1, ignore_case);

	2146 // If we can't continue after the loop then there is no sense in doing the

	2147 // loop.

	2148 if (continue_replacement == NULL) return set_replacement(NULL);

	2149 }

	2150

	2151 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);

	2152 }

	2153

	2154

	2155 RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) {

	2156 if (info()->replacement_calculated) return replacement();

	2157 if (depth < 0) return this;

	2158 if (info()->visited) return this;

	2159 VisitMarker marker(info());

	2160 intptr_t choice_count = alternatives_->length();

	2161

	2162 for (intptr_t i = 0; i < choice_count; i++) {

	2163 GuardedAlternative alternative = alternatives_->At(i);

	2164 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {

	2165 set_replacement(this);

	2166 return this;

	2167 }

	2168 }

	2169

	2170 intptr_t surviving = 0;

	2171 RegExpNode* survivor = NULL;

	2172 for (intptr_t i = 0; i < choice_count; i++) {

	2173 GuardedAlternative alternative = alternatives_->At(i);

	2174 RegExpNode* replacement =

	2175 alternative.node()->FilterOneByte(depth - 1, ignore_case);

	2176 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.

	2177 if (replacement != NULL) {

	2178 (*alternatives_)[i].set_node(replacement);

	2179 surviving++;

	2180 survivor = replacement;

	2181 }

	2182 }

	2183 if (surviving < 2) return set_replacement(survivor);

	2184

	2185 set_replacement(this);

	2186 if (surviving == choice_count) {

	2187 return this;

	2188 }

	2189 // Only some of the nodes survived the filtering. We need to rebuild the

	2190 // alternatives list.

	2191 ZoneGrowableArray<GuardedAlternative>* new_alternatives =

	2192 new(I) ZoneGrowableArray<GuardedAlternative>(surviving);

	2193 for (intptr_t i = 0; i < choice_count; i++) {

	2194 RegExpNode* replacement =

	2195 (*alternatives_)[i].node()->FilterOneByte(depth - 1, ignore_case);

	2196 if (replacement != NULL) {

	2197 (*alternatives_)[i].set_node(replacement);

	2198 new_alternatives->Add((*alternatives_)[i]);

	2199 }

	2200 }

	2201 alternatives_ = new_alternatives;

	2202 return this;

	2203 }

	2204

	2205

	2206 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(intptr_t depth,

	2207 bool ignore_case) {

	2208 if (info()->replacement_calculated) return replacement();

	2209 if (depth < 0) return this;

	2210 if (info()->visited) return this;

	2211 VisitMarker marker(info());

	2212 // Alternative 0 is the negative lookahead, alternative 1 is what comes

	2213 // afterwards.

	2214 RegExpNode* node = (*alternatives_)[1].node();

	2215 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);

	2216 if (replacement == NULL) return set_replacement(NULL);

	2217 (*alternatives_)[1].set_node(replacement);

	2218

	2219 RegExpNode* neg_node = (*alternatives_)[0].node();

	2220 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);

	2221 // If the negative lookahead is always going to fail then

	2222 // we don't need to check it.

	2223 if (neg_replacement == NULL) return set_replacement(replacement);

	2224 (*alternatives_)[0].set_node(neg_replacement);

	2225 return set_replacement(this);

	2226 }

	2227

	2228

	2229 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

	2230 RegExpCompiler* compiler,

	2231 intptr_t characters_filled_in,

	2232 bool not_at_start) {

	2233 if (body_can_be_zero_length_ \|\| info()->visited) return;

	2234 VisitMarker marker(info());

	2235 return ChoiceNode::GetQuickCheckDetails(details,

	2236 compiler,

	2237 characters_filled_in,

	2238 not_at_start);

	2239 }

	2240

	2241

	2242 void LoopChoiceNode::FillInBMInfo(intptr_t offset,

	2243 intptr_t budget,

	2244 BoyerMooreLookahead* bm,

	2245 bool not_at_start) {

	2246 if (body_can_be_zero_length_ \|\| budget <= 0) {

	2247 bm->SetRest(offset);

	2248 SaveBMInfo(bm, not_at_start, offset);

	2249 return;

	2250 }

	2251 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);

	2252 SaveBMInfo(bm, not_at_start, offset);

	2253 }

	2254

	2255

	2256 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

	2257 RegExpCompiler* compiler,

	2258 intptr_t characters_filled_in,

	2259 bool not_at_start) {

	2260 not_at_start = (not_at_start \|\| not_at_start_);

	2261 intptr_t choice_count = alternatives_->length();

	2262 ASSERT(choice_count > 0);

	2263 (*alternatives_)[0].node()->GetQuickCheckDetails(details,

	2264 compiler,

	2265 characters_filled_in,

	2266 not_at_start);

	2267 for (intptr_t i = 1; i < choice_count; i++) {

	2268 QuickCheckDetails new_details(details->characters());

	2269 RegExpNode* node = (*alternatives_)[i].node();

	2270 node->GetQuickCheckDetails(&new_details, compiler,

	2271 characters_filled_in,

	2272 not_at_start);

	2273 // Here we merge the quick match details of the two branches.

	2274 details->Merge(&new_details, characters_filled_in);

	2275 }

	2276 }

	2277

	2278

	2279 // Check for [0-9A-Z_a-z].

	2280 static void EmitWordCheck(RegExpMacroAssembler* assembler,

	2281 BlockLabel* word,

	2282 BlockLabel* non_word,

	2283 bool fall_through_on_word) {

	2284 if (assembler->CheckSpecialCharacterClass(

	2285 fall_through_on_word ? 'w' : 'W',

	2286 fall_through_on_word ? non_word : word)) {

	2287 // Optimized implementation available.

	2288 return;

	2289 }

	2290 assembler->CheckCharacterGT('z', non_word);

	2291 assembler->CheckCharacterLT('0', non_word);

	2292 assembler->CheckCharacterGT('a' - 1, word);

	2293 assembler->CheckCharacterLT('9' + 1, word);

	2294 assembler->CheckCharacterLT('A', non_word);

	2295 assembler->CheckCharacterLT('Z' + 1, word);

	2296 if (fall_through_on_word) {

	2297 assembler->CheckNotCharacter('_', non_word);

	2298 } else {

	2299 assembler->CheckCharacter('_', word);

	2300 }

	2301 }

	2302

	2303

	2304 // Emit the code to check for a ^ in multiline mode (1-character lookbehind

	2305 // that matches newline or the start of input).

	2306 static void EmitHat(RegExpCompiler* compiler,

	2307 RegExpNode* on_success,

	2308 Trace* trace) {

	2309 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2310 // We will be loading the previous character into the current character

	2311 // register.

	2312 Trace new_trace(*trace);

	2313 new_trace.InvalidateCurrentCharacter();

	2314

	2315 BlockLabel ok;

	2316 if (new_trace.cp_offset() == 0) {

	2317 // The start of input counts as a newline in this context, so skip to

	2318 // ok if we are at the start.

	2319 assembler->CheckAtStart(&ok);

	2320 }

	2321 // We already checked that we are not at the start of input so it must be

	2322 // OK to load the previous character.

	2323 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,

	2324 new_trace.backtrack(),

	2325 false);

	2326 if (!assembler->CheckSpecialCharacterClass('n',

	2327 new_trace.backtrack())) {

	2328 // Newline means \n, \r, 0x2028 or 0x2029.

	2329 if (!compiler->one_byte()) {

	2330 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);

	2331 }

	2332 assembler->CheckCharacter('\n', &ok);

	2333 assembler->CheckNotCharacter('\r', new_trace.backtrack());

	2334 }

	2335 assembler->BindBlock(&ok);

	2336 on_success->Emit(compiler, &new_trace);

	2337 }

	2338

	2339

	2340 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

	2341 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

	2342 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2343 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

	2344 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);

	2345 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

	2346 if (lookahead == NULL) {

	2347 intptr_t eats_at_least =

	2348 Utils::Minimum(kMaxLookaheadForBoyerMoore,

	2349 EatsAtLeast(kMaxLookaheadForBoyerMoore,

	2350 kRecursionBudget,

	2351 not_at_start));

	2352 if (eats_at_least >= 1) {

	2353 BoyerMooreLookahead* bm =

	2354 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);

	2355 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);

	2356 if (bm->at(0)->is_non_word())

	2357 next_is_word_character = Trace::FALSE_VALUE;

	2358 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;

	2359 }

	2360 } else {

	2361 if (lookahead->at(0)->is_non_word())

	2362 next_is_word_character = Trace::FALSE_VALUE;

	2363 if (lookahead->at(0)->is_word())

	2364 next_is_word_character = Trace::TRUE_VALUE;

	2365 }

	2366 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);

	2367 if (next_is_word_character == Trace::UNKNOWN) {

	2368 BlockLabel before_non_word;

	2369 BlockLabel before_word;

	2370 if (trace->characters_preloaded() != 1) {

	2371 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);

	2372 }

	2373 // Fall through on non-word.

	2374 EmitWordCheck(assembler, &before_word, &before_non_word, false);

	2375 // Next character is not a word character.

	2376 assembler->BindBlock(&before_non_word);

	2377 BlockLabel ok;

	2378 // Backtrack on \B (non-boundary check) if previous is a word,

	2379 // since we know next is not a word and this would be a boundary.

	2380 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

	2381

	2382 if (!assembler->IsClosed()) {

	2383 assembler->GoTo(&ok);

	2384 }

	2385

	2386 assembler->BindBlock(&before_word);

	2387 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

	2388 assembler->BindBlock(&ok);

	2389 } else if (next_is_word_character == Trace::TRUE_VALUE) {

	2390 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

	2391 } else {

	2392 ASSERT(next_is_word_character == Trace::FALSE_VALUE);

	2393 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

	2394 }

	2395 }

	2396

	2397

	2398 void AssertionNode::BacktrackIfPrevious(

	2399 RegExpCompiler* compiler,

	2400 Trace* trace,

	2401 AssertionNode::IfPrevious backtrack_if_previous) {

	2402 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2403 Trace new_trace(*trace);

	2404 new_trace.InvalidateCurrentCharacter();

	2405

	2406 BlockLabel fall_through, dummy;

	2407

	2408 BlockLabel* non_word = backtrack_if_previous == kIsNonWord ?

	2409 new_trace.backtrack() :

	2410 &fall_through;

	2411 BlockLabel* word = backtrack_if_previous == kIsNonWord ?

	2412 &fall_through :

	2413 new_trace.backtrack();

	2414

	2415 if (new_trace.cp_offset() == 0) {

	2416 // The start of input counts as a non-word character, so the question is

	2417 // decided if we are at the start.

	2418 assembler->CheckAtStart(non_word);

	2419 }

	2420 // We already checked that we are not at the start of input so it must be

	2421 // OK to load the previous character.

	2422 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);

	2423 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);

	2424

	2425 assembler->BindBlock(&fall_through);

	2426 on_success()->Emit(compiler, &new_trace);

	2427 }

	2428

	2429

	2430 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,

	2431 RegExpCompiler* compiler,

	2432 intptr_t filled_in,

	2433 bool not_at_start) {

	2434 if (assertion_type_ == AT_START && not_at_start) {

	2435 details->set_cannot_match();

	2436 return;

	2437 }

	2438 return on_success()->GetQuickCheckDetails(details,

	2439 compiler,

	2440 filled_in,

	2441 not_at_start);

	2442 }

	2443

	2444

	2445 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	2446 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2447 switch (assertion_type_) {

	2448 case AT_END: {

	2449 BlockLabel ok;

	2450 assembler->CheckPosition(trace->cp_offset(), &ok);

	2451 assembler->GoTo(trace->backtrack());

	2452 assembler->BindBlock(&ok);

	2453 break;

	2454 }

	2455 case AT_START: {

	2456 if (trace->at_start() == Trace::FALSE_VALUE) {

	2457 assembler->GoTo(trace->backtrack());

	2458 return;

	2459 }

	2460 if (trace->at_start() == Trace::UNKNOWN) {

	2461 assembler->CheckNotAtStart(trace->backtrack());

	2462 Trace at_start_trace = *trace;

	2463 at_start_trace.set_at_start(true);

	2464 on_success()->Emit(compiler, &at_start_trace);

	2465 return;

	2466 }

	2467 }

	2468 break;

	2469 case AFTER_NEWLINE:

	2470 EmitHat(compiler, on_success(), trace);

	2471 return;

	2472 case AT_BOUNDARY:

	2473 case AT_NON_BOUNDARY: {

	2474 EmitBoundaryCheck(compiler, trace);

	2475 return;

	2476 }

	2477 }

	2478 on_success()->Emit(compiler, trace);

	2479 }

	2480

	2481

	2482 static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {

	2483 if (quick_check == NULL) return false;

	2484 if (offset >= quick_check->characters()) return false;

	2485 return quick_check->positions(offset)->determines_perfectly;

	2486 }

	2487

	2488

	2489 static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {

	2490 if (index > *checked_up_to) {

	2491 *checked_up_to = index;

	2492 }

	2493 }

	2494

	2495

	2496 // We call this repeatedly to generate code for each pass over the text node.

	2497 // The passes are in increasing order of difficulty because we hope one

	2498 // of the first passes will fail in which case we are saved the work of the

	2499 // later passes. for example for the case independent regexp /%[asdfghjkl]a/

	2500 // we will check the '%' in the first pass, the case independent 'a' in the

	2501 // second pass and the character class in the last pass.

	2502 //

	2503 // The passes are done from right to left, so for example to test for /bar/

	2504 // we will first test for an 'r' with offset 2, then an 'a' with offset 1

	2505 // and then a 'b' with offset 0. This means we can avoid the end-of-input

	2506 // bounds check most of the time. In the example we only need to check for

	2507 // end-of-input when loading the putative 'r'.

	2508 //

	2509 // A slight complication involves the fact that the first character may already

	2510 // be fetched into a register by the previous node. In this case we want to

	2511 // do the test for that character first. We do this in separate passes. The

	2512 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a

	2513 // pass has been performed then subsequent passes will have true in

	2514 // first_element_checked to indicate that that character does not need to be

	2515 // checked again.

	2516 //

	2517 // In addition to all this we are passed a Trace, which can

	2518 // contain an AlternativeGeneration object. In this AlternativeGeneration

	2519 // object we can see details of any quick check that was already passed in

	2520 // order to get to the code we are now generating. The quick check can involve

	2521 // loading characters, which means we do not need to recheck the bounds

	2522 // up to the limit the quick check already checked. In addition the quick

	2523 // check can have involved a mask and compare operation which may simplify

	2524 // or obviate the need for further checks at some character positions.

	2525 void TextNode::TextEmitPass(RegExpCompiler* compiler,

	2526 TextEmitPassType pass,

	2527 bool preloaded,

	2528 Trace* trace,

	2529 bool first_element_checked,

	2530 intptr_t* checked_up_to) {

	2531 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2532 bool one_byte = compiler->one_byte();

	2533 BlockLabel* backtrack = trace->backtrack();

	2534 QuickCheckDetails* quick_check = trace->quick_check_performed();

	2535 intptr_t element_count = elms_->length();

	2536 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

	2537 TextElement elm = elms_->At(i);

	2538 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset();

	2539 if (elm.text_type() == TextElement::ATOM) {

	2540 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();

	2541 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {

	2542 if (first_element_checked && i == 0 && j == 0) continue;

	2543 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;

	2544 EmitCharacterFunction* emit_function = NULL;

	2545 switch (pass) {

	2546 case NON_LATIN1_MATCH:

	2547 ASSERT(one_byte);

	2548 if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) {

	2549 assembler->GoTo(backtrack);

	2550 return;

	2551 }

	2552 break;

	2553 case NON_LETTER_CHARACTER_MATCH:

	2554 emit_function = &EmitAtomNonLetter;

	2555 break;

	2556 case SIMPLE_CHARACTER_MATCH:

	2557 emit_function = &EmitSimpleCharacter;

	2558 break;

	2559 case CASE_CHARACTER_MATCH:

	2560 emit_function = &EmitAtomLetter;

	2561 break;

	2562 default:

	2563 break;

	2564 }

	2565 if (emit_function != NULL) {

	2566 bool bound_checked = emit_function(I,

	2567 compiler,

	2568 quarks->At(j),

	2569 backtrack,

	2570 cp_offset + j,

	2571 *checked_up_to < cp_offset + j,

	2572 preloaded);

	2573 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

	2574 }

	2575 }

	2576 } else {

	2577 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);

	2578 if (pass == CHARACTER_CLASS_MATCH) {

	2579 if (first_element_checked && i == 0) continue;

	2580 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;

	2581 RegExpCharacterClass* cc = elm.char_class();

	2582 EmitCharClass(assembler,

	2583 cc,

	2584 one_byte,

	2585 backtrack,

	2586 cp_offset,

	2587 *checked_up_to < cp_offset,

	2588 preloaded,

	2589 I);

	2590 UpdateBoundsCheck(cp_offset, checked_up_to);

	2591 }

	2592 }

	2593 }

	2594 }

	2595

	2596

	2597 intptr_t TextNode::Length() {

	2598 TextElement elm = elms_->Last();

	2599 ASSERT(elm.cp_offset() >= 0);

	2600 return elm.cp_offset() + elm.length();

	2601 }

	2602

	2603

	2604 bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {

	2605 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);

	2606 if (ignore_case) {

	2607 return pass == SIMPLE_CHARACTER_MATCH;

	2608 } else {

	2609 return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;

	2610 }

	2611 }

	2612

	2613

	2614 // This generates the code to match a text node. A text node can contain

	2615 // straight character sequences (possibly to be matched in a case-independent

	2616 // way) and character classes. For efficiency we do not do this in a single

	2617 // pass from left to right. Instead we pass over the text node several times,

	2618 // emitting code for some character positions every time. See the comment on

	2619 // TextEmitPass for details.

	2620 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	2621 LimitResult limit_result = LimitVersions(compiler, trace);

	2622 if (limit_result == DONE) return;

	2623 ASSERT(limit_result == CONTINUE);

	2624

	2625 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

	2626 compiler->SetRegExpTooBig();

	2627 return;

	2628 }

	2629

	2630 if (compiler->one_byte()) {

	2631 intptr_t dummy = 0;

	2632 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);

	2633 }

	2634

	2635 bool first_elt_done = false;

	2636 intptr_t bound_checked_to = trace->cp_offset() - 1;

	2637 bound_checked_to += trace->bound_checked_up_to();

	2638

	2639 // If a character is preloaded into the current character register then

	2640 // check that now.

	2641 if (trace->characters_preloaded() == 1) {

	2642 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {

	2643 if (!SkipPass(pass, compiler->ignore_case())) {

	2644 TextEmitPass(compiler,

	2645 static_cast<TextEmitPassType>(pass),

	2646 true,

	2647 trace,

	2648 false,

	2649 &bound_checked_to);

	2650 }

	2651 }

	2652 first_elt_done = true;

	2653 }

	2654

	2655 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {

	2656 if (!SkipPass(pass, compiler->ignore_case())) {

	2657 TextEmitPass(compiler,

	2658 static_cast<TextEmitPassType>(pass),

	2659 false,

	2660 trace,

	2661 first_elt_done,

	2662 &bound_checked_to);

	2663 }

	2664 }

	2665

	2666 Trace successor_trace(*trace);

	2667 successor_trace.set_at_start(false);

	2668 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);

	2669 RecursionCheck rc(compiler);

	2670 on_success()->Emit(compiler, &successor_trace);

	2671 }

	2672

	2673

	2674 void Trace::InvalidateCurrentCharacter() {

	2675 characters_preloaded_ = 0;

	2676 }

	2677

	2678

	2679 void Trace::AdvanceCurrentPositionInTrace(intptr_t by,

	2680 RegExpCompiler* compiler) {

	2681 ASSERT(by > 0);

	2682 // We don't have an instruction for shifting the current character register

	2683 // down or for using a shifted value for anything so lets just forget that

	2684 // we preloaded any characters into it.

	2685 characters_preloaded_ = 0;

	2686 // Adjust the offsets of the quick check performed information. This

	2687 // information is used to find out what we already determined about the

	2688 // characters by means of mask and compare.

	2689 quick_check_performed_.Advance(by, compiler->one_byte());

	2690 cp_offset_ += by;

	2691 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {

	2692 compiler->SetRegExpTooBig();

	2693 cp_offset_ = 0;

	2694 }

	2695 bound_checked_up_to_ = Utils::Maximum(static_cast<intptr_t>(0),

	2696 bound_checked_up_to_ - by);

	2697 }

	2698

	2699

	2700 void TextNode::MakeCaseIndependent(bool is_one_byte) {

	2701 intptr_t element_count = elms_->length();

	2702 for (intptr_t i = 0; i < element_count; i++) {

	2703 TextElement elm = elms_->At(i);

	2704 if (elm.text_type() == TextElement::CHAR_CLASS) {

	2705 RegExpCharacterClass* cc = elm.char_class();

	2706 // None of the standard character classes is different in the case

	2707 // independent case and it slows us down if we don't know that.

	2708 if (cc->is_standard()) continue;

	2709 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();

	2710 intptr_t range_count = ranges->length();

	2711 for (intptr_t j = 0; j < range_count; j++) {

	2712 (*ranges)[j].AddCaseEquivalents(ranges, is_one_byte, I);

	2713 }

	2714 }

	2715 }

	2716 }

	2717

	2718

	2719 intptr_t TextNode::GreedyLoopTextLength() {

	2720 TextElement elm = elms_->At(elms_->length() - 1);

	2721 return elm.cp_offset() + elm.length();

	2722 }

	2723

	2724

	2725 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(

	2726 RegExpCompiler* compiler) {

	2727 if (elms_->length() != 1) return NULL;

	2728 TextElement elm = elms_->At(0);

	2729 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;

	2730 RegExpCharacterClass* node = elm.char_class();

	2731 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();

	2732 if (!CharacterRange::IsCanonical(ranges)) {

	2733 CharacterRange::Canonicalize(ranges);

	2734 }

	2735 if (node->is_negated()) {

	2736 return ranges->length() == 0 ? on_success() : NULL;

	2737 }

	2738 if (ranges->length() != 1) return NULL;

	2739 uint32_t max_char;

	2740 if (compiler->one_byte()) {

	2741 max_char = Symbols::kMaxOneCharCodeSymbol;

	2742 } else {

	2743 max_char = Utf16::kMaxCodeUnit;

	2744 }

	2745 return ranges->At(0).IsEverything(max_char) ? on_success() : NULL;

	2746 }

	2747

	2748

	2749 // Finds the fixed match length of a sequence of nodes that goes from

	2750 // this alternative and back to this choice node. If there are variable

	2751 // length nodes or other complications in the way then return a sentinel

	2752 // value indicating that a greedy loop cannot be constructed.

	2753 intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(

	2754 GuardedAlternative* alternative) {

	2755 intptr_t length = 0;

	2756 RegExpNode* node = alternative->node();

	2757 // Later we will generate code for all these text nodes using recursion

	2758 // so we have to limit the max number.

	2759 intptr_t recursion_depth = 0;

	2760 while (node != this) {

	2761 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {

	2762 return kNodeIsTooComplexForGreedyLoops;

	2763 }

	2764 intptr_t node_length = node->GreedyLoopTextLength();

	2765 if (node_length == kNodeIsTooComplexForGreedyLoops) {

	2766 return kNodeIsTooComplexForGreedyLoops;

	2767 }

	2768 length += node_length;

	2769 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);

	2770 node = seq_node->on_success();

	2771 }

	2772 return length;

	2773 }

	2774

	2775

	2776 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {

	2777 ASSERT(loop_node_ == NULL);

	2778 AddAlternative(alt);

	2779 loop_node_ = alt.node();

	2780 }

	2781

	2782

	2783 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {

	2784 ASSERT(continue_node_ == NULL);

	2785 AddAlternative(alt);

	2786 continue_node_ = alt.node();

	2787 }

	2788

	2789

	2790 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	2791 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	2792 if (trace->stop_node() == this) {

	2793 // Back edge of greedy optimized loop node graph.

	2794 intptr_t text_length =

	2795 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));

	2796 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);

	2797 // Update the counter-based backtracking info on the stack. This is an

	2798 // optimization for greedy loops (see below).

	2799 ASSERT(trace->cp_offset() == text_length);

	2800 macro_assembler->AdvanceCurrentPosition(text_length);

	2801 macro_assembler->GoTo(trace->loop_label());

	2802 return;

	2803 }

	2804 ASSERT(trace->stop_node() == NULL);

	2805 if (!trace->is_trivial()) {

	2806 trace->Flush(compiler, this);

	2807 return;

	2808 }

	2809 ChoiceNode::Emit(compiler, trace);

	2810 }

	2811

	2812

	2813 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,

	2814 intptr_t eats_at_least) {

	2815 intptr_t preload_characters = Utils::Minimum(static_cast<intptr_t>(4),

	2816 eats_at_least);

	2817 if (compiler->macro_assembler()->CanReadUnaligned()) {

	2818 bool one_byte = compiler->one_byte();

	2819 if (one_byte) {

	2820 if (preload_characters > 4) preload_characters = 4;

	2821 // We can't preload 3 characters because there is no machine instruction

	2822 // to do that. We can't just load 4 because we could be reading

	2823 // beyond the end of the string, which could cause a memory fault.

	2824 if (preload_characters == 3) preload_characters = 2;

	2825 } else {

	2826 if (preload_characters > 2) preload_characters = 2;

	2827 }

	2828 } else {

	2829 if (preload_characters > 1) preload_characters = 1;

	2830 }

	2831 return preload_characters;

	2832 }

	2833

	2834

	2835 // This structure is used when generating the alternatives in a choice node. It

	2836 // records the way the alternative is being code generated.

	2837 struct AlternativeGeneration {

	2838 AlternativeGeneration()

	2839 : possible_success(),

	2840 expects_preload(false),

	2841 after(),

	2842 quick_check_details() { }

	2843 BlockLabel possible_success;

	2844 bool expects_preload;

	2845 BlockLabel after;

	2846 QuickCheckDetails quick_check_details;

	2847 };

	2848

	2849

	2850 // Creates a list of AlternativeGenerations. If the list has a reasonable

	2851 // size then it is on the stack, otherwise the excess is on the heap.

	2852 class AlternativeGenerationList {

	2853 public:

	2854 explicit AlternativeGenerationList(intptr_t count)

	2855 : alt_gens_(count) {

	2856 for (intptr_t i = 0; i < count && i < kAFew; i++) {

	2857 alt_gens_.Add(a_few_alt_gens_ + i);

	2858 }

	2859 for (intptr_t i = kAFew; i < count; i++) {

	2860 alt_gens_.Add(new AlternativeGeneration());

	2861 }

	2862 }

	2863 ~AlternativeGenerationList() {

	2864 for (intptr_t i = kAFew; i < alt_gens_.length(); i++) {

	2865 delete alt_gens_[i];

	2866 alt_gens_[i] = NULL;

	2867 }

	2868 }

	2869

	2870 AlternativeGeneration* at(intptr_t i) {

	2871 return alt_gens_[i];

	2872 }

	2873

	2874 private:

	2875 static const intptr_t kAFew = 10;

	2876 GrowableArray<AlternativeGeneration*> alt_gens_;

	2877 AlternativeGeneration a_few_alt_gens_[kAFew];

	2878

	2879 DISALLOW_ALLOCATION();

	2880 };

	2881

	2882

	2883 // The '2' variant is inclusive from and exclusive to.

	2884 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,

	2885 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.

	2886 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,

	2887 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,

	2888 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,

	2889 0xFEFF, 0xFF00, 0x10000 };

	2890 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);

	2891 static const intptr_t kWordRanges[] = {

	2892 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };

	2893 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);

	2894 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 };

	2895 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);

	2896 static const intptr_t kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };

	2897 static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);

	2898 static const intptr_t kLineTerminatorRanges[] = {

	2899 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 };

	2900 static const intptr_t kLineTerminatorRangeCount =

	2901 ARRAY_SIZE(kLineTerminatorRanges);

	2902

	2903

	2904 void BoyerMoorePositionInfo::Set(intptr_t character) {

	2905 SetInterval(Interval(character, character));

	2906 }

	2907

	2908

	2909 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {

	2910 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);

	2911 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);

	2912 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);

	2913 surrogate_ =

	2914 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);

	2915 if (interval.to() - interval.from() >= kMapSize - 1) {

	2916 if (map_count_ != kMapSize) {

	2917 map_count_ = kMapSize;

	2918 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;

	2919 }

	2920 return;

	2921 }

	2922 for (intptr_t i = interval.from(); i <= interval.to(); i++) {

	2923 intptr_t mod_character = (i & kMask);

	2924 if (!map_->At(mod_character)) {

	2925 map_count_++;

	2926 (*map_)[mod_character] = true;

	2927 }

	2928 if (map_count_ == kMapSize) return;

	2929 }

	2930 }

	2931

	2932

	2933 void BoyerMoorePositionInfo::SetAll() {

	2934 s_ = w_ = d_ = kLatticeUnknown;

	2935 if (map_count_ != kMapSize) {

	2936 map_count_ = kMapSize;

	2937 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;

	2938 }

	2939 }

	2940

	2941

	2942 BoyerMooreLookahead::BoyerMooreLookahead(

	2943 intptr_t length, RegExpCompiler* compiler, Isolate* isolate)

	2944 : length_(length),

	2945 compiler_(compiler) {

	2946 if (compiler->one_byte()) {

	2947 max_char_ = Symbols::kMaxOneCharCodeSymbol;

	2948 } else {

	2949 max_char_ = Utf16::kMaxCodeUnit;

	2950 }

	2951 bitmaps_ = new(isolate) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);

	2952 for (intptr_t i = 0; i < length; i++) {

	2953 bitmaps_->Add(new(isolate) BoyerMoorePositionInfo(isolate));

	2954 }

	2955 }

	2956

	2957

	2958 // Find the longest range of lookahead that has the fewest number of different

	2959 // characters that can occur at a given position. Since we are optimizing two

	2960 // different parameters at once this is a tradeoff.

	2961 bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {

	2962 intptr_t biggest_points = 0;

	2963 // If more than 32 characters out of 128 can occur it is unlikely that we can

	2964 // be lucky enough to step forwards much of the time.

	2965 const intptr_t kMaxMax = 32;

	2966 for (intptr_t max_number_of_chars = 4;

	2967 max_number_of_chars < kMaxMax;

	2968 max_number_of_chars *= 2) {

	2969 biggest_points =

	2970 FindBestInterval(max_number_of_chars, biggest_points, from, to);

	2971 }

	2972 if (biggest_points == 0) return false;

	2973 return true;

	2974 }

	2975

	2976

	2977 // Find the highest-points range between 0 and length_ where the character

	2978 // information is not too vague. 'Too vague' means that there are more than

	2979 // max_number_of_chars that can occur at this position. Calculates the number

	2980 // of points as the product of width-of-the-range and

	2981 // probability-of-finding-one-of-the-characters, where the probability is

	2982 // calculated using the frequency distribution of the sample subject string.

	2983 intptr_t BoyerMooreLookahead::FindBestInterval(

	2984 intptr_t max_number_of_chars,

	2985 intptr_t old_biggest_points,

	2986 intptr_t* from,

	2987 intptr_t* to) {

	2988 intptr_t biggest_points = old_biggest_points;

	2989 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	2990 for (intptr_t i = 0; i < length_; ) {

	2991 while (i < length_ && Count(i) > max_number_of_chars) i++;

	2992 if (i == length_) break;

	2993 intptr_t remembered_from = i;

	2994 bool union_map[kSize];

	2995 for (intptr_t j = 0; j < kSize; j++) union_map[j] = false;

	2996 while (i < length_ && Count(i) <= max_number_of_chars) {

	2997 BoyerMoorePositionInfo* map = bitmaps_->At(i);

	2998 for (intptr_t j = 0; j < kSize; j++) union_map[j] \|= map->at(j);

	2999 i++;

	3000 }

	3001 intptr_t frequency = 0;

	3002 for (intptr_t j = 0; j < kSize; j++) {

	3003 if (union_map[j]) {

	3004 // Add 1 to the frequency to give a small per-character boost for

	3005 // the cases where our sampling is not good enough and many

	3006 // characters have a frequency of zero. This means the frequency

	3007 // can theoretically be up to 2*kSize though we treat it mostly as

	3008 // a fraction of kSize.

	3009 frequency += compiler_->frequency_collator()->Frequency(j) + 1;

	3010 }

	3011 }

	3012 // We use the probability of skipping times the distance we are skipping to

	3013 // judge the effectiveness of this. Actually we have a cut-off: By

	3014 // dividing by 2 we switch off the skipping if the probability of skipping

	3015 // is less than 50%. This is because the multibyte mask-and-compare

	3016 // skipping in quickcheck is more likely to do well on this case.

	3017 bool in_quickcheck_range = ((i - remembered_from < 4) \|\|

	3018 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));

	3019 // Called 'probability' but it is only a rough estimate and can actually

	3020 // be outside the 0-kSize range.

	3021 intptr_t probability =

	3022 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;

	3023 intptr_t points = (i - remembered_from) * probability;

	3024 if (points > biggest_points) {

	3025 *from = remembered_from;

	3026 *to = i - 1;

	3027 biggest_points = points;

	3028 }

	3029 }

	3030 return biggest_points;

	3031 }

	3032

	3033

	3034 // Take all the characters that will not prevent a successful match if they

	3035 // occur in the subject string in the range between min_lookahead and

	3036 // max_lookahead (inclusive) measured from the current position. If the

	3037 // character at max_lookahead offset is not one of these characters, then we

	3038 // can safely skip forwards by the number of characters in the range.

	3039 intptr_t BoyerMooreLookahead::GetSkipTable(

	3040 intptr_t min_lookahead,

	3041 intptr_t max_lookahead,

	3042 const TypedData& boolean_skip_table) {

	3043 const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	3044

	3045 const intptr_t kSkipArrayEntry = 0;

	3046 const intptr_t kDontSkipArrayEntry = 1;

	3047

	3048 for (intptr_t i = 0; i < kSize; i++) {

	3049 boolean_skip_table.SetUint8(i, kSkipArrayEntry);

	3050 }

	3051 intptr_t skip = max_lookahead + 1 - min_lookahead;

	3052

	3053 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {

	3054 BoyerMoorePositionInfo* map = bitmaps_->At(i);

	3055 for (intptr_t j = 0; j < kSize; j++) {

	3056 if (map->at(j)) {

	3057 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);

	3058 }

	3059 }

	3060 }

	3061

	3062 return skip;

	3063 }

	3064

	3065

	3066 // See comment above on the implementation of GetSkipTable.

	3067 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {

	3068 const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	3069

	3070 intptr_t min_lookahead = 0;

	3071 intptr_t max_lookahead = 0;

	3072

	3073 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;

	3074

	3075 bool found_single_character = false;

	3076 intptr_t single_character = 0;

	3077 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {

	3078 BoyerMoorePositionInfo* map = bitmaps_->At(i);

	3079 if (map->map_count() > 1 \|\|

	3080 (found_single_character && map->map_count() != 0)) {

	3081 found_single_character = false;

	3082 break;

	3083 }

	3084 for (intptr_t j = 0; j < kSize; j++) {

	3085 if (map->at(j)) {

	3086 found_single_character = true;

	3087 single_character = j;

	3088 break;

	3089 }

	3090 }

	3091 }

	3092

	3093 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;

	3094

	3095 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {

	3096 // The mask-compare can probably handle this better.

	3097 return;

	3098 }

	3099

	3100 if (found_single_character) {

	3101 BlockLabel cont, again;

	3102 masm->BindBlock(&again);

	3103 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

	3104 if (max_char_ > kSize) {

	3105 masm->CheckCharacterAfterAnd(single_character,

	3106 RegExpMacroAssembler::kTableMask,

	3107 &cont);

	3108 } else {

	3109 masm->CheckCharacter(single_character, &cont);

	3110 }

	3111 masm->AdvanceCurrentPosition(lookahead_width);

	3112 masm->GoTo(&again);

	3113 masm->BindBlock(&cont);

	3114 return;

	3115 }

	3116

	3117 const TypedData& boolean_skip_table = TypedData::ZoneHandle(

	3118 compiler_->isolate(),

	3119 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));

	3120 intptr_t skip_distance = GetSkipTable(

	3121 min_lookahead, max_lookahead, boolean_skip_table);

	3122 ASSERT(skip_distance != 0);

	3123

	3124 BlockLabel cont, again;

	3125

	3126 masm->BindBlock(&again);

	3127 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

	3128 masm->CheckBitInTable(boolean_skip_table, &cont);

	3129 masm->AdvanceCurrentPosition(skip_distance);

	3130 masm->GoTo(&again);

	3131 masm->BindBlock(&cont);

	3132

	3133 return;

	3134 }

	3135

	3136

	3137 /* Code generation for choice nodes.

	3138 *

	3139 * We generate quick checks that do a mask and compare to eliminate a

	3140 * choice. If the quick check succeeds then it jumps to the continuation to

	3141 * do slow checks and check subsequent nodes. If it fails (the common case)

	3142 * it falls through to the next choice.

	3143 *

	3144 * Here is the desired flow graph. Nodes directly below each other imply

	3145 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative

	3146 * 3 doesn't have a quick check so we have to call the slow check.

	3147 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire

	3148 * regexp continuation is generated directly after the Sn node, up to the

	3149 * next GoTo if we decide to reuse some already generated code. Some

	3150 * nodes expect preload_characters to be preloaded into the current

	3151 * character register. R nodes do this preloading. Vertices are marked

	3152 * F for failures and S for success (possible success in the case of quick

	3153 * nodes). L, V, < and > are used as arrow heads.

	3154 *

	3155 * ----------> R

	3156 * \|

	3157 * V

	3158 * Q1 -----> S1

	3159 * \| S /

	3160 * F\| /

	3161 * \| F/

	3162 * \| /

	3163 * \| R

	3164 * \| /

	3165 * V L

	3166 * Q2 -----> S2

	3167 * \| S /

	3168 * F\| /

	3169 * \| F/

	3170 * \| /

	3171 * \| R

	3172 * \| /

	3173 * V L

	3174 * S3

	3175 * \|

	3176 * F\|

	3177 * \|

	3178 * R

	3179 * \|

	3180 * backtrack V

	3181 * <----------Q4

	3182 * \ F \|

	3183 * \ \|S

	3184 * \ F V

	3185 * \-----S4

	3186 *

	3187 * For greedy loops we push the current position, then generate the code that

	3188 * eats the input specially in EmitGreedyLoop. The other choice (the

	3189 * continuation) is generated by the normal code in EmitChoices, and steps back

	3190 * in the input to the starting position when it fails to match. The loop code

	3191 * looks like this (U is the unwind code that steps back in the greedy loop).

	3192 *

	3193 * _____

	3194 * / \

	3195 * V \|

	3196 * ----------> S1 \|

	3197 * /\| \|

	3198 * / \|S \|

	3199 * F/ \_____/

	3200 * /

	3201 * \|<-----

	3202 * \| \

	3203 * V \|S

	3204 * Q2 ---> U----->backtrack

	3205 * \| F /

	3206 * S\| /

	3207 * V F /

	3208 * S2--/

	3209 */

	3210

	3211 GreedyLoopState::GreedyLoopState(bool not_at_start) {

	3212 counter_backtrack_trace_.set_backtrack(&label_);

	3213 if (not_at_start) counter_backtrack_trace_.set_at_start(false);

	3214 }

	3215

	3216

	3217 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {

	3218 #ifdef DEBUG

	3219 intptr_t choice_count = alternatives_->length();

	3220 for (intptr_t i = 0; i < choice_count - 1; i++) {

	3221 GuardedAlternative alternative = alternatives_->At(i);

	3222 ZoneGrowableArray<Guard> guards = alternative.guards();

	3223 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();

	3224 for (intptr_t j = 0; j < guard_count; j++) {

	3225 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));

	3226 }

	3227 }

	3228 #endif

	3229 }

	3230

	3231

	3232 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,

	3233 Trace* current_trace,

	3234 PreloadState* state) {

	3235 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {

	3236 // Save some time by looking at most one machine word ahead.

	3237 state->eats_at_least_ =

	3238 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,

	3239 current_trace->at_start() == Trace::FALSE_VALUE);

	3240 }

	3241 state->preload_characters_ =

	3242 CalculatePreloadCharacters(compiler, state->eats_at_least_);

	3243

	3244 state->preload_is_current_ =

	3245 (current_trace->characters_preloaded() == state->preload_characters_);

	3246 state->preload_has_checked_bounds_ = state->preload_is_current_;

	3247 }

	3248

	3249

	3250 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	3251 intptr_t choice_count = alternatives_->length();

	3252

	3253 AssertGuardsMentionRegisters(trace);

	3254

	3255 LimitResult limit_result = LimitVersions(compiler, trace);

	3256 if (limit_result == DONE) return;

	3257 ASSERT(limit_result == CONTINUE);

	3258

	3259 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for

	3260 // other choice nodes we only flush if we are out of code size budget.

	3261 if (trace->flush_budget() == 0 && trace->actions() != NULL) {

	3262 trace->Flush(compiler, this);

	3263 return;

	3264 }

	3265

	3266 RecursionCheck rc(compiler);

	3267

	3268 PreloadState preload;

	3269 preload.init();

	3270 GreedyLoopState greedy_loop_state(not_at_start());

	3271

	3272 intptr_t text_length =

	3273 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));

	3274 AlternativeGenerationList alt_gens(choice_count);

	3275

	3276 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {

	3277 trace = EmitGreedyLoop(compiler,

	3278 trace,

	3279 &alt_gens,

	3280 &preload,

	3281 &greedy_loop_state,

	3282 text_length);

	3283 } else {

	3284 // TODO(erikcorry): Delete this. We don't need this label, but it makes us

	3285 // match the traces produced pre-cleanup.

	3286 BlockLabel second_choice;

	3287 compiler->macro_assembler()->BindBlock(&second_choice);

	3288

	3289 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);

	3290

	3291 EmitChoices(compiler,

	3292 &alt_gens,

	3293 0,

	3294 trace,

	3295 &preload);

	3296 }

	3297

	3298 // At this point we need to generate slow checks for the alternatives where

	3299 // the quick check was inlined. We can recognize these because the associated

	3300 // label was bound.

	3301 intptr_t new_flush_budget = trace->flush_budget() / choice_count;

	3302 for (intptr_t i = 0; i < choice_count; i++) {

	3303 AlternativeGeneration* alt_gen = alt_gens.at(i);

	3304 Trace new_trace(*trace);

	3305 // If there are actions to be flushed we have to limit how many times

	3306 // they are flushed. Take the budget of the parent trace and distribute

	3307 // it fairly amongst the children.

	3308 if (new_trace.actions() != NULL) {

	3309 new_trace.set_flush_budget(new_flush_budget);

	3310 }

	3311 bool next_expects_preload =

	3312 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;

	3313 EmitOutOfLineContinuation(compiler,

	3314 &new_trace,

	3315 alternatives_->At(i),

	3316 alt_gen,

	3317 preload.preload_characters_,

	3318 next_expects_preload);

	3319 }

	3320 }

	3321

	3322 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,

	3323 Trace* trace,

	3324 AlternativeGenerationList* alt_gens,

	3325 PreloadState* preload,

	3326 GreedyLoopState* greedy_loop_state,

	3327 intptr_t text_length) {

	3328 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	3329 // Here we have special handling for greedy loops containing only text nodes

	3330 // and other simple nodes. These are handled by pushing the current

	3331 // position on the stack and then incrementing the current position each

	3332 // time around the switch. On backtrack we decrement the current position

	3333 // and check it against the pushed value. This avoids pushing backtrack

	3334 // information for each iteration of the loop, which could take up a lot of

	3335 // space.

	3336 ASSERT(trace->stop_node() == NULL);

	3337 macro_assembler->PushCurrentPosition();

	3338 BlockLabel greedy_match_failed;

	3339 Trace greedy_match_trace;

	3340 if (not_at_start()) greedy_match_trace.set_at_start(false);

	3341 greedy_match_trace.set_backtrack(&greedy_match_failed);

	3342 BlockLabel loop_label;

	3343 macro_assembler->BindBlock(&loop_label);

	3344 greedy_match_trace.set_stop_node(this);

	3345 greedy_match_trace.set_loop_label(&loop_label);

	3346 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);

	3347 macro_assembler->BindBlock(&greedy_match_failed);

	3348

	3349 BlockLabel second_choice; // For use in greedy matches.

	3350 macro_assembler->BindBlock(&second_choice);

	3351

	3352 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();

	3353

	3354 EmitChoices(compiler,

	3355 alt_gens,

	3356 1,

	3357 new_trace,

	3358 preload);

	3359

	3360 macro_assembler->BindBlock(greedy_loop_state->label());

	3361 // If we have unwound to the bottom then backtrack.

	3362 macro_assembler->CheckGreedyLoop(trace->backtrack());

	3363 // Otherwise try the second priority at an earlier position.

	3364 macro_assembler->AdvanceCurrentPosition(-text_length);

	3365 macro_assembler->GoTo(&second_choice);

	3366 return new_trace;

	3367 }

	3368

	3369

	3370 intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,

	3371 Trace* trace) {

	3372 intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;

	3373 if (alternatives_->length() != 2) return eats_at_least;

	3374

	3375 GuardedAlternative alt1 = alternatives_->At(1);

	3376 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {

	3377 return eats_at_least;

	3378 }

	3379 RegExpNode* eats_anything_node = alt1.node();

	3380 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {

	3381 return eats_at_least;

	3382 }

	3383

	3384 // Really we should be creating a new trace when we execute this function,

	3385 // but there is no need, because the code it generates cannot backtrack, and

	3386 // we always arrive here with a trivial trace (since it's the entry to a

	3387 // loop. That also implies that there are no preloaded characters, which is

	3388 // good, because it means we won't be violating any assumptions by

	3389 // overwriting those characters with new load instructions.

	3390 ASSERT(trace->is_trivial());

	3391

	3392 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	3393 // At this point we know that we are at a non-greedy loop that will eat

	3394 // any character one at a time. Any non-anchored regexp has such a

	3395 // loop prepended to it in order to find where it starts. We look for

	3396 // a pattern of the form ...abc... where we can look 6 characters ahead

	3397 // and step forwards 3 if the character is not one of abc. Abc need

	3398 // not be atoms, they can be any reasonably limited character class or

	3399 // small alternation.

	3400 BoyerMooreLookahead* bm = bm_info(false);

	3401 if (bm == NULL) {

	3402 eats_at_least = Utils::Minimum(kMaxLookaheadForBoyerMoore,

	3403 EatsAtLeast(kMaxLookaheadForBoyerMoore,

	3404 kRecursionBudget,

	3405 false));

	3406 if (eats_at_least >= 1) {

	3407 bm = new(I) BoyerMooreLookahead(eats_at_least, compiler, I);

	3408 GuardedAlternative alt0 = alternatives_->At(0);

	3409 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);

	3410 }

	3411 }

	3412 if (bm != NULL) {

	3413 bm->EmitSkipInstructions(macro_assembler);

	3414 }

	3415 return eats_at_least;

	3416 }

	3417

	3418

	3419 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,

	3420 AlternativeGenerationList* alt_gens,

	3421 intptr_t first_choice,

	3422 Trace* trace,

	3423 PreloadState* preload) {

	3424 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	3425 SetUpPreLoad(compiler, trace, preload);

	3426

	3427 // For now we just call all choices one after the other. The idea ultimately

	3428 // is to use the Dispatch table to try only the relevant ones.

	3429 intptr_t choice_count = alternatives_->length();

	3430

	3431 intptr_t new_flush_budget = trace->flush_budget() / choice_count;

	3432

	3433 for (intptr_t i = first_choice; i < choice_count; i++) {

	3434 bool is_last = i == choice_count - 1;

	3435 bool fall_through_on_failure = !is_last;

	3436 GuardedAlternative alternative = alternatives_->At(i);

	3437 AlternativeGeneration* alt_gen = alt_gens->at(i);

	3438 alt_gen->quick_check_details.set_characters(preload->preload_characters_);

	3439 ZoneGrowableArray<Guard> guards = alternative.guards();

	3440 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();

	3441 Trace new_trace(*trace);

	3442 new_trace.set_characters_preloaded(preload->preload_is_current_ ?

	3443 preload->preload_characters_ :

	3444 0);

	3445 if (preload->preload_has_checked_bounds_) {

	3446 new_trace.set_bound_checked_up_to(preload->preload_characters_);

	3447 }

	3448 new_trace.quick_check_performed()->Clear();

	3449 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);

	3450 if (!is_last) {

	3451 new_trace.set_backtrack(&alt_gen->after);

	3452 }

	3453 alt_gen->expects_preload = preload->preload_is_current_;

	3454 bool generate_full_check_inline = false;

	3455 if (kRegexpOptimization &&

	3456 try_to_emit_quick_check_for_alternative(i == 0) &&

	3457 alternative.node()->EmitQuickCheck(compiler,

	3458 trace,

	3459 &new_trace,

	3460 preload->preload_has_checked_bounds_,

	3461 &alt_gen->possible_success,

	3462 &alt_gen->quick_check_details,

	3463 fall_through_on_failure)) {

	3464 // Quick check was generated for this choice.

	3465 preload->preload_is_current_ = true;

	3466 preload->preload_has_checked_bounds_ = true;

	3467 // If we generated the quick check to fall through on possible success,

	3468 // we now need to generate the full check inline.

	3469 if (!fall_through_on_failure) {

	3470 macro_assembler->BindBlock(&alt_gen->possible_success);

	3471 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);

	3472 new_trace.set_characters_preloaded(preload->preload_characters_);

	3473 new_trace.set_bound_checked_up_to(preload->preload_characters_);

	3474 generate_full_check_inline = true;

	3475 }

	3476 } else if (alt_gen->quick_check_details.cannot_match()) {

	3477 if (!fall_through_on_failure) {

	3478 macro_assembler->GoTo(trace->backtrack());

	3479 }

	3480 continue;

	3481 } else {

	3482 // No quick check was generated. Put the full code here.

	3483 // If this is not the first choice then there could be slow checks from

	3484 // previous cases that go here when they fail. There's no reason to

	3485 // insist that they preload characters since the slow check we are about

	3486 // to generate probably can't use it.

	3487 if (i != first_choice) {

	3488 alt_gen->expects_preload = false;

	3489 new_trace.InvalidateCurrentCharacter();

	3490 }

	3491 generate_full_check_inline = true;

	3492 }

	3493 if (generate_full_check_inline) {

	3494 if (new_trace.actions() != NULL) {

	3495 new_trace.set_flush_budget(new_flush_budget);

	3496 }

	3497 for (intptr_t j = 0; j < guard_count; j++) {

	3498 GenerateGuard(macro_assembler, guards->At(j), &new_trace);

	3499 }

	3500 alternative.node()->Emit(compiler, &new_trace);

	3501 preload->preload_is_current_ = false;

	3502 }

	3503 macro_assembler->BindBlock(&alt_gen->after);

	3504 }

	3505 }

	3506

	3507

	3508 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,

	3509 Trace* trace,

	3510 GuardedAlternative alternative,

	3511 AlternativeGeneration* alt_gen,

	3512 intptr_t preload_characters,

	3513 bool next_expects_preload) {

	3514 if (!alt_gen->possible_success.IsLinked()) return;

	3515

	3516 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	3517 macro_assembler->BindBlock(&alt_gen->possible_success);

	3518 Trace out_of_line_trace(*trace);

	3519 out_of_line_trace.set_characters_preloaded(preload_characters);

	3520 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);

	3521 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);

	3522 ZoneGrowableArray<Guard> guards = alternative.guards();

	3523 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();

	3524 if (next_expects_preload) {

	3525 BlockLabel reload_current_char;

	3526 out_of_line_trace.set_backtrack(&reload_current_char);

	3527 for (intptr_t j = 0; j < guard_count; j++) {

	3528 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);

	3529 }

	3530 alternative.node()->Emit(compiler, &out_of_line_trace);

	3531 macro_assembler->BindBlock(&reload_current_char);

	3532 // Reload the current character, since the next quick check expects that.

	3533 // We don't need to check bounds here because we only get into this

	3534 // code through a quick check which already did the checked load.

	3535 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),

	3536 NULL,

	3537 false,

	3538 preload_characters);

	3539 macro_assembler->GoTo(&(alt_gen->after));

	3540 } else {

	3541 out_of_line_trace.set_backtrack(&(alt_gen->after));

	3542 for (intptr_t j = 0; j < guard_count; j++) {

	3543 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);

	3544 }

	3545 alternative.node()->Emit(compiler, &out_of_line_trace);

	3546 }

	3547 }

	3548

	3549

	3550 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	3551 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	3552 LimitResult limit_result = LimitVersions(compiler, trace);

	3553 if (limit_result == DONE) return;

	3554 ASSERT(limit_result == CONTINUE);

	3555

	3556 RecursionCheck rc(compiler);

	3557

	3558 switch (action_type_) {

	3559 case STORE_POSITION: {

	3560 Trace::DeferredCapture

	3561 new_capture(data_.u_position_register.reg,

	3562 data_.u_position_register.is_capture,

	3563 trace);

	3564 Trace new_trace = *trace;

	3565 new_trace.add_action(&new_capture);

	3566 on_success()->Emit(compiler, &new_trace);

	3567 break;

	3568 }

	3569 case INCREMENT_REGISTER: {

	3570 Trace::DeferredIncrementRegister

	3571 new_increment(data_.u_increment_register.reg);

	3572 Trace new_trace = *trace;

	3573 new_trace.add_action(&new_increment);

	3574 on_success()->Emit(compiler, &new_trace);

	3575 break;

	3576 }

	3577 case SET_REGISTER: {

	3578 Trace::DeferredSetRegister

	3579 new_set(data_.u_store_register.reg, data_.u_store_register.value);

	3580 Trace new_trace = *trace;

	3581 new_trace.add_action(&new_set);

	3582 on_success()->Emit(compiler, &new_trace);

	3583 break;

	3584 }

	3585 case CLEAR_CAPTURES: {

	3586 Trace::DeferredClearCaptures

	3587 new_capture(Interval(data_.u_clear_captures.range_from,

	3588 data_.u_clear_captures.range_to));

	3589 Trace new_trace = *trace;

	3590 new_trace.add_action(&new_capture);

	3591 on_success()->Emit(compiler, &new_trace);

	3592 break;

	3593 }

	3594 case BEGIN_SUBMATCH:

	3595 if (!trace->is_trivial()) {

	3596 trace->Flush(compiler, this);

	3597 } else {

	3598 assembler->WriteCurrentPositionToRegister(

	3599 data_.u_submatch.current_position_register, 0);

	3600 assembler->WriteStackPointerToRegister(

	3601 data_.u_submatch.stack_pointer_register);

	3602 on_success()->Emit(compiler, trace);

	3603 }

	3604 break;

	3605 case EMPTY_MATCH_CHECK: {

	3606 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;

	3607 intptr_t stored_pos = 0;

	3608 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;

	3609 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);

	3610 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);

	3611 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {

	3612 // If we know we haven't advanced and there is no minimum we

	3613 // can just backtrack immediately.

	3614 assembler->GoTo(trace->backtrack());

	3615 } else if (know_dist && stored_pos < trace->cp_offset()) {

	3616 // If we know we've advanced we can generate the continuation

	3617 // immediately.

	3618 on_success()->Emit(compiler, trace);

	3619 } else if (!trace->is_trivial()) {

	3620 trace->Flush(compiler, this);

	3621 } else {

	3622 BlockLabel skip_empty_check;

	3623 // If we have a minimum number of repetitions we check the current

	3624 // number first and skip the empty check if it's not enough.

	3625 if (has_minimum) {

	3626 intptr_t limit = data_.u_empty_match_check.repetition_limit;

	3627 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);

	3628 }

	3629 // If the match is empty we bail out, otherwise we fall through

	3630 // to the on-success continuation.

	3631 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,

	3632 trace->backtrack());

	3633 assembler->BindBlock(&skip_empty_check);

	3634 on_success()->Emit(compiler, trace);

	3635 }

	3636 break;

	3637 }

	3638 case POSITIVE_SUBMATCH_SUCCESS: {

	3639 if (!trace->is_trivial()) {

	3640 trace->Flush(compiler, this);

	3641 return;

	3642 }

	3643 assembler->ReadCurrentPositionFromRegister(

	3644 data_.u_submatch.current_position_register);

	3645 assembler->ReadStackPointerFromRegister(

	3646 data_.u_submatch.stack_pointer_register);

	3647 intptr_t clear_register_count = data_.u_submatch.clear_register_count;

	3648 if (clear_register_count == 0) {

	3649 on_success()->Emit(compiler, trace);

	3650 return;

	3651 }

	3652 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;

	3653 BlockLabel clear_registers_backtrack;

	3654 Trace new_trace = *trace;

	3655 new_trace.set_backtrack(&clear_registers_backtrack);

	3656 on_success()->Emit(compiler, &new_trace);

	3657

	3658 assembler->BindBlock(&clear_registers_backtrack);

	3659 intptr_t clear_registers_to =

	3660 clear_registers_from + clear_register_count - 1;

	3661 assembler->ClearRegisters(clear_registers_from, clear_registers_to);

	3662

	3663 ASSERT(trace->backtrack() == NULL);

	3664 assembler->Backtrack();

	3665 return;

	3666 }

	3667 default:

	3668 UNREACHABLE();

	3669 }

	3670 }

	3671

	3672

	3673 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	3674 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	3675 if (!trace->is_trivial()) {

	3676 trace->Flush(compiler, this);

	3677 return;

	3678 }

	3679

	3680 LimitResult limit_result = LimitVersions(compiler, trace);

	3681 if (limit_result == DONE) return;

	3682 ASSERT(limit_result == CONTINUE);

	3683

	3684 RecursionCheck rc(compiler);

	3685

	3686 ASSERT(start_reg_ + 1 == end_reg_);

	3687 if (compiler->ignore_case()) {

	3688 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,

	3689 trace->backtrack());

	3690 } else {

	3691 assembler->CheckNotBackReference(start_reg_, trace->backtrack());

	3692 }

	3693 on_success()->Emit(compiler, trace);

	3694 }

	3695

	3696

	3697 // -------------------------------------------------------------------

	3698 // Dot/dotty output

	3699

	3700

	3701 #ifdef DEBUG

	3702

	3703

	3704 class DotPrinter: public NodeVisitor {

	3705 public:

	3706 explicit DotPrinter(bool ignore_case)

	3707 : ignore_case_(ignore_case) {}

	3708 void PrintNode(const char* label, RegExpNode* node);

	3709 void Visit(RegExpNode* node);

	3710 void PrintAttributes(RegExpNode* from);

	3711 void PrintOnFailure(RegExpNode* from, RegExpNode* to);

	3712 #define DECLARE_VISIT(Type) \

	3713 virtual void Visit##Type(Type##Node* that);

	3714 FOR_EACH_NODE_TYPE(DECLARE_VISIT)

	3715 #undef DECLARE_VISIT

	3716 private:

	3717 bool ignore_case_;

	3718 };

	3719

	3720

	3721 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {

	3722 OS::Print("digraph G {\n graph [label=\"");

	3723 for (intptr_t i = 0; label[i]; i++) {

	3724 switch (label[i]) {

	3725 case '\\':

	3726 OS::Print("\\\\");

	3727 break;

	3728 case '"':

	3729 OS::Print("\"");

	3730 break;

	3731 default:

	3732 OS::Print("%c", label[i]);

	3733 break;

	3734 }

	3735 }

	3736 OS::Print("\"];\n");

	3737 Visit(node);

	3738 OS::Print("}\n");

	3739 }

	3740

	3741

	3742 void DotPrinter::Visit(RegExpNode* node) {

	3743 if (node->info()->visited) return;

	3744 node->info()->visited = true;

	3745 node->Accept(this);

	3746 }

	3747

	3748

	3749 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {

	3750 OS::Print(" n%p -> n%p [style=dotted];\n", from, on_failure);

	3751 Visit(on_failure);

	3752 }

	3753

	3754

	3755 class AttributePrinter : public ValueObject {

	3756 public:

	3757 AttributePrinter() : first_(true) {}

	3758 void PrintSeparator() {

	3759 if (first_) {

	3760 first_ = false;

	3761 } else {

	3762 OS::Print("\|");

	3763 }

	3764 }

	3765 void PrintBit(const char* name, bool value) {

	3766 if (!value) return;

	3767 PrintSeparator();

	3768 OS::Print("{%s}", name);

	3769 }

	3770 void PrintPositive(const char* name, intptr_t value) {

	3771 if (value < 0) return;

	3772 PrintSeparator();

	3773 OS::Print("{%s\|%" Pd "}", name, value);

	3774 }

	3775

	3776 private:

	3777 bool first_;

	3778 };

	3779

	3780

	3781 void DotPrinter::PrintAttributes(RegExpNode* that) {

	3782 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "

	3783 "margin=0.1, fontsize=10, label=\"{", that);

	3784 AttributePrinter printer;

	3785 NodeInfo* info = that->info();

	3786 printer.PrintBit("NI", info->follows_newline_interest);

	3787 printer.PrintBit("WI", info->follows_word_interest);

	3788 printer.PrintBit("SI", info->follows_start_interest);

	3789 BlockLabel* label = that->label();

	3790 if (label->IsBound())

	3791 printer.PrintPositive("@", label->Position());

	3792 OS::Print("}\"];\n"

	3793 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",

	3794 that, that);

	3795 }

	3796

	3797

	3798 void DotPrinter::VisitChoice(ChoiceNode* that) {

	3799 OS::Print(" n%p [shape=Mrecord, label=\"?\"];\n", that);

	3800 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {

	3801 GuardedAlternative alt = that->alternatives()->At(i);

	3802 OS::Print(" n%p -> n%p", that, alt.node());

	3803 }

	3804 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {

	3805 GuardedAlternative alt = that->alternatives()->At(i);

	3806 alt.node()->Accept(this);

	3807 }

	3808 }

	3809

	3810

	3811 void DotPrinter::VisitText(TextNode* that) {

	3812 OS::Print(" n%p [label=\"", that);

	3813 for (intptr_t i = 0; i < that->elements()->length(); i++) {

	3814 if (i > 0) OS::Print(" ");

	3815 TextElement elm = that->elements()->At(i);

	3816 switch (elm.text_type()) {

	3817 case TextElement::ATOM: {

	3818 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();

	3819 for (intptr_t i = 0; i < data->length(); i++) {

	3820 OS::Print("%c", static_cast<char>(data->At(i)));

	3821 }

	3822 break;

	3823 }

	3824 case TextElement::CHAR_CLASS: {

	3825 RegExpCharacterClass* node = elm.char_class();

	3826 OS::Print("[");

	3827 if (node->is_negated()) OS::Print("^");

	3828 for (intptr_t j = 0; j < node->ranges()->length(); j++) {

	3829 CharacterRange range = node->ranges()->At(j);

	3830 PrintUtf16(range.from());

	3831 OS::Print("-");

	3832 PrintUtf16(range.to());

	3833 }

	3834 OS::Print("]");

	3835 break;

	3836 }

	3837 default:

	3838 UNREACHABLE();

	3839 }

	3840 }

	3841 OS::Print("\", shape=box, peripheries=2];\n");

	3842 PrintAttributes(that);

	3843 OS::Print(" n%p -> n%p;\n", that, that->on_success());

	3844 Visit(that->on_success());

	3845 }

	3846

	3847

	3848 void DotPrinter::VisitBackReference(BackReferenceNode* that) {

	3849 OS::Print(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",

	3850 that, that->start_register(), that->end_register());

	3851 PrintAttributes(that);

	3852 OS::Print(" n%p -> n%p;\n", that, that->on_success());

	3853 Visit(that->on_success());

	3854 }

	3855

	3856

	3857 void DotPrinter::VisitEnd(EndNode* that) {

	3858 OS::Print(" n%p [style=bold, shape=point];\n", that);

	3859 PrintAttributes(that);

	3860 }

	3861

	3862

	3863 void DotPrinter::VisitAssertion(AssertionNode* that) {

	3864 OS::Print(" n%p [", that);

	3865 switch (that->assertion_type()) {

	3866 case AssertionNode::AT_END:

	3867 OS::Print("label=\"$\", shape=septagon");

	3868 break;

	3869 case AssertionNode::AT_START:

	3870 OS::Print("label=\"^\", shape=septagon");

	3871 break;

	3872 case AssertionNode::AT_BOUNDARY:

	3873 OS::Print("label=\"\\b\", shape=septagon");

	3874 break;

	3875 case AssertionNode::AT_NON_BOUNDARY:

	3876 OS::Print("label=\"\\B\", shape=septagon");

	3877 break;

	3878 case AssertionNode::AFTER_NEWLINE:

	3879 OS::Print("label=\"(?<=\\n)\", shape=septagon");

	3880 break;

	3881 }

	3882 OS::Print("];\n");

	3883 PrintAttributes(that);

	3884 RegExpNode* successor = that->on_success();

	3885 OS::Print(" n%p -> n%p;\n", that, successor);

	3886 Visit(successor);

	3887 }

	3888

	3889

	3890 void DotPrinter::VisitAction(ActionNode* that) {

	3891 OS::Print(" n%p [", that);

	3892 switch (that->action_type_) {

	3893 case ActionNode::SET_REGISTER:

	3894 OS::Print("label=\"$%" Pd ":=%" Pd "\", shape=octagon",

	3895 that->data_.u_store_register.reg,

	3896 that->data_.u_store_register.value);

	3897 break;

	3898 case ActionNode::INCREMENT_REGISTER:

	3899 OS::Print("label=\"$%" Pd "++\", shape=octagon",

	3900 that->data_.u_increment_register.reg);

	3901 break;

	3902 case ActionNode::STORE_POSITION:

	3903 OS::Print("label=\"$%" Pd ":=$pos\", shape=octagon",

	3904 that->data_.u_position_register.reg);

	3905 break;

	3906 case ActionNode::BEGIN_SUBMATCH:

	3907 OS::Print("label=\"$%" Pd ":=$pos,begin\", shape=septagon",

	3908 that->data_.u_submatch.current_position_register);

	3909 break;

	3910 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:

	3911 OS::Print("label=\"escape\", shape=septagon");

	3912 break;

	3913 case ActionNode::EMPTY_MATCH_CHECK:

	3914 OS::Print("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",

	3915 that->data_.u_empty_match_check.start_register,

	3916 that->data_.u_empty_match_check.repetition_register,

	3917 that->data_.u_empty_match_check.repetition_limit);

	3918 break;

	3919 case ActionNode::CLEAR_CAPTURES: {

	3920 OS::Print("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",

	3921 that->data_.u_clear_captures.range_from,

	3922 that->data_.u_clear_captures.range_to);

	3923 break;

	3924 }

	3925 }

	3926 OS::Print("];\n");

	3927 PrintAttributes(that);

	3928 RegExpNode* successor = that->on_success();

	3929 OS::Print(" n%p -> n%p;\n", that, successor);

	3930 Visit(successor);

	3931 }

	3932

	3933

	3934 void RegExpEngine::DotPrint(const char* label,

	3935 RegExpNode* node,

	3936 bool ignore_case) {

	3937 DotPrinter printer(ignore_case);

	3938 printer.PrintNode(label, node);

	3939 }

	3940

	3941

	3942 #endif // DEBUG

	3943

	3944

	3945 // -------------------------------------------------------------------

	3946 // Tree to graph conversion

	3947

	3948 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,

	3949 RegExpNode* on_success) {

	3950 ZoneGrowableArray<TextElement>* elms =

	3951 new(CI) ZoneGrowableArray<TextElement>(1);

	3952 elms->Add(TextElement::Atom(this));

	3953 return new(CI) TextNode(elms, on_success);

	3954 }

	3955

	3956

	3957 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,

	3958 RegExpNode* on_success) {

	3959 ZoneGrowableArray<TextElement>* elms =

	3960 new(CI) ZoneGrowableArray<TextElement>(1);

	3961 for (intptr_t i = 0; i < elements()->length(); i++) {

	3962 elms->Add(elements()->At(i));

	3963 }

	3964 return new(CI) TextNode(elms, on_success);

	3965 }

	3966

	3967

	3968 static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,

	3969 const intptr_t* special_class,

	3970 intptr_t length) {

	3971 length--; // Remove final 0x10000.

	3972 ASSERT(special_class[length] == 0x10000);

	3973 ASSERT(ranges->length() != 0);

	3974 ASSERT(length != 0);

	3975 ASSERT(special_class[0] != 0);

	3976 if (ranges->length() != (length >> 1) + 1) {

	3977 return false;

	3978 }

	3979 CharacterRange range = ranges->At(0);

	3980 if (range.from() != 0) {

	3981 return false;

	3982 }

	3983 for (intptr_t i = 0; i < length; i += 2) {

	3984 if (special_class[i] != (range.to() + 1)) {

	3985 return false;

	3986 }

	3987 range = ranges->At((i >> 1) + 1);

	3988 if (special_class[i+1] != range.from()) {

	3989 return false;

	3990 }

	3991 }

	3992 if (range.to() != 0xffff) {

	3993 return false;

	3994 }

	3995 return true;

	3996 }

	3997

	3998

	3999 static bool CompareRanges(ZoneGrowableArray<CharacterRange>* ranges,

	4000 const intptr_t* special_class,

	4001 intptr_t length) {

	4002 length--; // Remove final 0x10000.

	4003 ASSERT(special_class[length] == 0x10000);

	4004 if (ranges->length() * 2 != length) {

	4005 return false;

	4006 }

	4007 for (intptr_t i = 0; i < length; i += 2) {

	4008 CharacterRange range = ranges->At(i >> 1);

	4009 if (range.from() != special_class[i] \|\|

	4010 range.to() != special_class[i + 1] - 1) {

	4011 return false;

	4012 }

	4013 }

	4014 return true;

	4015 }

	4016

	4017

	4018 bool RegExpCharacterClass::is_standard() {

	4019 // TODO(lrn): Remove need for this function, by not throwing away information

	4020 // along the way.

	4021 if (is_negated_) {

	4022 return false;

	4023 }

	4024 if (set_.is_standard()) {

	4025 return true;

	4026 }

	4027 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {

	4028 set_.set_standard_set_type('s');

	4029 return true;

	4030 }

	4031 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {

	4032 set_.set_standard_set_type('S');

	4033 return true;

	4034 }

	4035 if (CompareInverseRanges(set_.ranges(),

	4036 kLineTerminatorRanges,

	4037 kLineTerminatorRangeCount)) {

	4038 set_.set_standard_set_type('.');

	4039 return true;

	4040 }

	4041 if (CompareRanges(set_.ranges(),

	4042 kLineTerminatorRanges,

	4043 kLineTerminatorRangeCount)) {

	4044 set_.set_standard_set_type('n');

	4045 return true;

	4046 }

	4047 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {

	4048 set_.set_standard_set_type('w');

	4049 return true;

	4050 }

	4051 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {

	4052 set_.set_standard_set_type('W');

	4053 return true;

	4054 }

	4055 return false;

	4056 }

	4057

	4058

	4059 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

	4060 RegExpNode* on_success) {

	4061 return new(CI) TextNode(this, on_success);

	4062 }

	4063

	4064

	4065 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,

	4066 RegExpNode* on_success) {

	4067 ZoneGrowableArray<RegExpTree> alternatives = this->alternatives();

	4068 intptr_t length = alternatives->length();

	4069 ChoiceNode* result =

	4070 new(CI) ChoiceNode(length, CI);

	4071 for (intptr_t i = 0; i < length; i++) {

	4072 GuardedAlternative alternative(alternatives->At(i)->ToNode(compiler,

	4073 on_success));

	4074 result->AddAlternative(alternative);

	4075 }

	4076 return result;

	4077 }

	4078

	4079

	4080 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,

	4081 RegExpNode* on_success) {

	4082 return ToNode(min(),

	4083 max(),

	4084 is_greedy(),

	4085 body(),

	4086 compiler,

	4087 on_success);

	4088 }

	4089

	4090

	4091 // Scoped object to keep track of how much we unroll quantifier loops in the

	4092 // regexp graph generator.

	4093 class RegExpExpansionLimiter : public ValueObject {

	4094 public:

	4095 static const intptr_t kMaxExpansionFactor = 6;

	4096 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)

	4097 : compiler_(compiler),

	4098 saved_expansion_factor_(compiler->current_expansion_factor()),

	4099 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {

	4100 ASSERT(factor > 0);

	4101 if (ok_to_expand_) {

	4102 if (factor > kMaxExpansionFactor) {

	4103 // Avoid integer overflow of the current expansion factor.

	4104 ok_to_expand_ = false;

	4105 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);

	4106 } else {

	4107 intptr_t new_factor = saved_expansion_factor_ * factor;

	4108 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);

	4109 compiler->set_current_expansion_factor(new_factor);

	4110 }

	4111 }

	4112 }

	4113

	4114 ~RegExpExpansionLimiter() {

	4115 compiler_->set_current_expansion_factor(saved_expansion_factor_);

	4116 }

	4117

	4118 bool ok_to_expand() { return ok_to_expand_; }

	4119

	4120 private:

	4121 RegExpCompiler* compiler_;

	4122 intptr_t saved_expansion_factor_;

	4123 bool ok_to_expand_;

	4124

	4125 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);

	4126 };

	4127

	4128

	4129 RegExpNode* RegExpQuantifier::ToNode(intptr_t min,

	4130 intptr_t max,

	4131 bool is_greedy,

	4132 RegExpTree* body,

	4133 RegExpCompiler* compiler,

	4134 RegExpNode* on_success,

	4135 bool not_at_start) {

	4136 // x{f, t} becomes this:

	4137 //

	4138 // (r++)<-.

	4139 // \| `

	4140 // \| (x)

	4141 // v ^

	4142 // (r=0)-->(?)---/ [if r < t]

	4143 // \|

	4144 // [if r >= f] \----> ...

	4145 //

	4146

	4147 // 15.10.2.5 RepeatMatcher algorithm.

	4148 // The parser has already eliminated the case where max is 0. In the case

	4149 // where max_match is zero the parser has removed the quantifier if min was

	4150 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.

	4151

	4152 // If we know that we cannot match zero length then things are a little

	4153 // simpler since we don't need to make the special zero length match check

	4154 // from step 2.1. If the min and max are small we can unroll a little in

	4155 // this case.

	4156 // Unroll (foo)+ and (foo){3,}

	4157 static const intptr_t kMaxUnrolledMinMatches = 3;

	4158 // Unroll (foo)? and (foo){x,3}

	4159 static const intptr_t kMaxUnrolledMaxMatches = 3;

	4160 if (max == 0) return on_success; // This can happen due to recursion.

	4161 bool body_can_be_empty = (body->min_match() == 0);

	4162 intptr_t body_start_reg = RegExpCompiler::kNoRegister;

	4163 Interval capture_registers = body->CaptureRegisters();

	4164 bool needs_capture_clearing = !capture_registers.is_empty();

	4165 Isolate* isolate = compiler->isolate();

	4166

	4167 if (body_can_be_empty) {

	4168 body_start_reg = compiler->AllocateRegister();

	4169 } else if (kRegexpOptimization && !needs_capture_clearing) {

	4170 // Only unroll if there are no captures and the body can't be

	4171 // empty.

	4172 {

	4173 RegExpExpansionLimiter limiter(

	4174 compiler, min + ((max != min) ? 1 : 0));

	4175 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {

	4176 intptr_t new_max = (max == kInfinity) ? max : max - min;

	4177 // Recurse once to get the loop or optional matches after the fixed

	4178 // ones.

	4179 RegExpNode* answer = ToNode(

	4180 0, new_max, is_greedy, body, compiler, on_success, true);

	4181 // Unroll the forced matches from 0 to min. This can cause chains of

	4182 // TextNodes (which the parser does not generate). These should be

	4183 // combined if it turns out they hinder good code generation.

	4184 for (intptr_t i = 0; i < min; i++) {

	4185 answer = body->ToNode(compiler, answer);

	4186 }

	4187 return answer;

	4188 }

	4189 }

	4190 if (max <= kMaxUnrolledMaxMatches && min == 0) {

	4191 ASSERT(max > 0); // Due to the 'if' above.

	4192 RegExpExpansionLimiter limiter(compiler, max);

	4193 if (limiter.ok_to_expand()) {

	4194 // Unroll the optional matches up to max.

	4195 RegExpNode* answer = on_success;

	4196 for (intptr_t i = 0; i < max; i++) {

	4197 ChoiceNode* alternation = new(isolate) ChoiceNode(2, isolate);

	4198 if (is_greedy) {

	4199 alternation->AddAlternative(

	4200 GuardedAlternative(body->ToNode(compiler, answer)));

	4201 alternation->AddAlternative(GuardedAlternative(on_success));

	4202 } else {

	4203 alternation->AddAlternative(GuardedAlternative(on_success));

	4204 alternation->AddAlternative(

	4205 GuardedAlternative(body->ToNode(compiler, answer)));

	4206 }

	4207 answer = alternation;

	4208 if (not_at_start) alternation->set_not_at_start();

	4209 }

	4210 return answer;

	4211 }

	4212 }

	4213 }

	4214 bool has_min = min > 0;

	4215 bool has_max = max < RegExpTree::kInfinity;

	4216 bool needs_counter = has_min \|\| has_max;

	4217 intptr_t reg_ctr = needs_counter

	4218 ? compiler->AllocateRegister()

	4219 : RegExpCompiler::kNoRegister;

	4220 LoopChoiceNode* center = new(isolate) LoopChoiceNode(body->min_match() == 0,

	4221 isolate);

	4222 if (not_at_start) center->set_not_at_start();

	4223 RegExpNode* loop_return = needs_counter

	4224 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))

	4225 : static_cast<RegExpNode*>(center);

	4226 if (body_can_be_empty) {

	4227 // If the body can be empty we need to check if it was and then

	4228 // backtrack.

	4229 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,

	4230 reg_ctr,

	4231 min,

	4232 loop_return);

	4233 }

	4234 RegExpNode* body_node = body->ToNode(compiler, loop_return);

	4235 if (body_can_be_empty) {

	4236 // If the body can be empty we need to store the start position

	4237 // so we can bail out if it was empty.

	4238 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);

	4239 }

	4240 if (needs_capture_clearing) {

	4241 // Before entering the body of this loop we need to clear captures.

	4242 body_node = ActionNode::ClearCaptures(capture_registers, body_node);

	4243 }

	4244 GuardedAlternative body_alt(body_node);

	4245 if (has_max) {

	4246 Guard* body_guard =

	4247 new(isolate) Guard(reg_ctr, Guard::LT, max);

	4248 body_alt.AddGuard(body_guard, isolate);

	4249 }

	4250 GuardedAlternative rest_alt(on_success);

	4251 if (has_min) {

	4252 Guard* rest_guard = new(isolate) Guard(reg_ctr, Guard::GEQ, min);

	4253 rest_alt.AddGuard(rest_guard, isolate);

	4254 }

	4255 if (is_greedy) {

	4256 center->AddLoopAlternative(body_alt);

	4257 center->AddContinueAlternative(rest_alt);

	4258 } else {

	4259 center->AddContinueAlternative(rest_alt);

	4260 center->AddLoopAlternative(body_alt);

	4261 }

	4262 if (needs_counter) {

	4263 return ActionNode::SetRegister(reg_ctr, 0, center);

	4264 } else {

	4265 return center;

	4266 }

	4267 }

	4268

	4269

	4270 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

	4271 RegExpNode* on_success) {

	4272 switch (assertion_type()) {

	4273 case START_OF_LINE:

	4274 return AssertionNode::AfterNewline(on_success);

	4275 case START_OF_INPUT:

	4276 return AssertionNode::AtStart(on_success);

	4277 case BOUNDARY:

	4278 return AssertionNode::AtBoundary(on_success);

	4279 case NON_BOUNDARY:

	4280 return AssertionNode::AtNonBoundary(on_success);

	4281 case END_OF_INPUT:

	4282 return AssertionNode::AtEnd(on_success);

	4283 case END_OF_LINE: {

	4284 // Compile $ in multiline regexps as an alternation with a positive

	4285 // lookahead in one side and an end-of-input on the other side.

	4286 // We need two registers for the lookahead.

	4287 intptr_t stack_pointer_register = compiler->AllocateRegister();

	4288 intptr_t position_register = compiler->AllocateRegister();

	4289 // The ChoiceNode to distinguish between a newline and end-of-input.

	4290 ChoiceNode* result = new ChoiceNode(2, on_success->isolate());

	4291 // Create a newline atom.

	4292 ZoneGrowableArray<CharacterRange>* newline_ranges =

	4293 new ZoneGrowableArray<CharacterRange>(3);

	4294 CharacterRange::AddClassEscape('n', newline_ranges);

	4295 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');

	4296 TextNode* newline_matcher = new TextNode(

	4297 newline_atom,

	4298 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,

	4299 position_register,

	4300 0, // No captures inside.

	4301 -1, // Ignored if no captures.

	4302 on_success));

	4303 // Create an end-of-input matcher.

	4304 RegExpNode* end_of_line = ActionNode::BeginSubmatch(

	4305 stack_pointer_register,

	4306 position_register,

	4307 newline_matcher);

	4308 // Add the two alternatives to the ChoiceNode.

	4309 GuardedAlternative eol_alternative(end_of_line);

	4310 result->AddAlternative(eol_alternative);

	4311 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));

	4312 result->AddAlternative(end_alternative);

	4313 return result;

	4314 }

	4315 default:

	4316 UNREACHABLE();

	4317 }

	4318 return on_success;

	4319 }

	4320

	4321

	4322 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,

	4323 RegExpNode* on_success) {

	4324 return new(CI)

	4325 BackReferenceNode(RegExpCapture::StartRegister(index()),

	4326 RegExpCapture::EndRegister(index()),

	4327 on_success);

	4328 }

	4329

	4330

	4331 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,

	4332 RegExpNode* on_success) {

	4333 return on_success;

	4334 }

	4335

	4336

	4337 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,

	4338 RegExpNode* on_success) {

	4339 intptr_t stack_pointer_register = compiler->AllocateRegister();

	4340 intptr_t position_register = compiler->AllocateRegister();

	4341

	4342 const intptr_t registers_per_capture = 2;

	4343 const intptr_t register_of_first_capture = 2;

	4344 intptr_t register_count = capture_count_ * registers_per_capture;

	4345 intptr_t register_start =

	4346 register_of_first_capture + capture_from_ * registers_per_capture;

	4347

	4348 RegExpNode* success;

	4349 if (is_positive()) {

	4350 RegExpNode* node = ActionNode::BeginSubmatch(

	4351 stack_pointer_register,

	4352 position_register,

	4353 body()->ToNode(

	4354 compiler,

	4355 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,

	4356 position_register,

	4357 register_count,

	4358 register_start,

	4359 on_success)));

	4360 return node;

	4361 } else {

	4362 // We use a ChoiceNode for a negative lookahead because it has most of

	4363 // the characteristics we need. It has the body of the lookahead as its

	4364 // first alternative and the expression after the lookahead of the second

	4365 // alternative. If the first alternative succeeds then the

	4366 // NegativeSubmatchSuccess will unwind the stack including everything the

	4367 // choice node set up and backtrack. If the first alternative fails then

	4368 // the second alternative is tried, which is exactly the desired result

	4369 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special

	4370 // ChoiceNode that knows to ignore the first exit when calculating quick

	4371 // checks.

	4372

	4373 GuardedAlternative body_alt(

	4374 body()->ToNode(

	4375 compiler,

	4376 success = new(CI) NegativeSubmatchSuccess(stack_pointer_register,

	4377 position_register,

	4378 register_count,

	4379 register_start,

	4380 CI)));

	4381 ChoiceNode* choice_node =

	4382 new(CI) NegativeLookaheadChoiceNode(body_alt,

	4383 GuardedAlternative(on_success),

	4384 CI);

	4385 return ActionNode::BeginSubmatch(stack_pointer_register,

	4386 position_register,

	4387 choice_node);

	4388 }

	4389 }

	4390

	4391

	4392 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,

	4393 RegExpNode* on_success) {

	4394 return ToNode(body(), index(), compiler, on_success);

	4395 }

	4396

	4397

	4398 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,

	4399 intptr_t index,

	4400 RegExpCompiler* compiler,

	4401 RegExpNode* on_success) {

	4402 intptr_t start_reg = RegExpCapture::StartRegister(index);

	4403 intptr_t end_reg = RegExpCapture::EndRegister(index);

	4404 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);

	4405 RegExpNode* body_node = body->ToNode(compiler, store_end);

	4406 return ActionNode::StorePosition(start_reg, true, body_node);

	4407 }

	4408

	4409

	4410 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,

	4411 RegExpNode* on_success) {

	4412 ZoneGrowableArray<RegExpTree> children = nodes();

	4413 RegExpNode* current = on_success;

	4414 for (intptr_t i = children->length() - 1; i >= 0; i--) {

	4415 current = children->At(i)->ToNode(compiler, current);

	4416 }

	4417 return current;

	4418 }

	4419

	4420

	4421 static void AddClass(const intptr_t* elmv,

	4422 intptr_t elmc,

	4423 ZoneGrowableArray<CharacterRange>* ranges) {

	4424 elmc--;

	4425 ASSERT(elmv[elmc] == 0x10000);

	4426 for (intptr_t i = 0; i < elmc; i += 2) {

	4427 ASSERT(elmv[i] < elmv[i + 1]);

	4428 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));

	4429 }

	4430 }

	4431

	4432

	4433 static void AddClassNegated(const intptr_t *elmv,

	4434 intptr_t elmc,

	4435 ZoneGrowableArray<CharacterRange>* ranges) {

	4436 elmc--;

	4437 ASSERT(elmv[elmc] == 0x10000);

	4438 ASSERT(elmv[0] != 0x0000);

	4439 ASSERT(elmv[elmc-1] != Utf16::kMaxCodeUnit);

	4440 uint16_t last = 0x0000;

	4441 for (intptr_t i = 0; i < elmc; i += 2) {

	4442 ASSERT(last <= elmv[i] - 1);

	4443 ASSERT(elmv[i] < elmv[i + 1]);

	4444 ranges->Add(CharacterRange(last, elmv[i] - 1));

	4445 last = elmv[i + 1];

	4446 }

	4447 ranges->Add(CharacterRange(last, Utf16::kMaxCodeUnit));

	4448 }

	4449

	4450

	4451 void CharacterRange::AddClassEscape(uint16_t type,

	4452 ZoneGrowableArray<CharacterRange>* ranges) {

	4453 switch (type) {

	4454 case 's':

	4455 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);

	4456 break;

	4457 case 'S':

	4458 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);

	4459 break;

	4460 case 'w':

	4461 AddClass(kWordRanges, kWordRangeCount, ranges);

	4462 break;

	4463 case 'W':

	4464 AddClassNegated(kWordRanges, kWordRangeCount, ranges);

	4465 break;

	4466 case 'd':

	4467 AddClass(kDigitRanges, kDigitRangeCount, ranges);

	4468 break;

	4469 case 'D':

	4470 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);

	4471 break;

	4472 case '.':

	4473 AddClassNegated(kLineTerminatorRanges,

	4474 kLineTerminatorRangeCount,

	4475 ranges);

	4476 break;

	4477 // This is not a character range as defined by the spec but a

	4478 // convenient shorthand for a character class that matches any

	4479 // character.

	4480 case '*':

	4481 ranges->Add(CharacterRange::Everything());

	4482 break;

	4483 // This is the set of characters matched by the $ and ^ symbols

	4484 // in multiline mode.

	4485 case 'n':

	4486 AddClass(kLineTerminatorRanges,

	4487 kLineTerminatorRangeCount,

	4488 ranges);

	4489 break;

	4490 default:

	4491 UNREACHABLE();

	4492 }

	4493 }

	4494

	4495

	4496 void CharacterRange::AddCaseEquivalents(

	4497 ZoneGrowableArray<CharacterRange>* ranges,

	4498 bool is_one_byte,

	4499 Isolate* isolate) {

	4500 uint16_t bottom = from();

	4501 uint16_t top = to();

	4502 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {

	4503 if (bottom > Symbols::kMaxOneCharCodeSymbol) return;

	4504 if (top > Symbols::kMaxOneCharCodeSymbol) {

	4505 top = Symbols::kMaxOneCharCodeSymbol;

	4506 }

	4507 }

	4508

	4509 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;

	4510 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;

	4511 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	4512 if (top == bottom) {

	4513 // If this is a singleton we just expand the one character.

	4514 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLIN T

	4515 for (intptr_t i = 0; i < length; i++) {

	4516 uint32_t chr = chars[i];

	4517 if (chr != bottom) {

	4518 ranges->Add(CharacterRange::Singleton(chars[i]));

	4519 }

	4520 }

	4521 } else {

	4522 // If this is a range we expand the characters block by block,

	4523 // expanding contiguous subranges (blocks) one at a time.

	4524 // The approach is as follows. For a given start character we

	4525 // look up the remainder of the block that contains it (represented

	4526 // by the end point), for instance we find 'z' if the character

	4527 // is 'c'. A block is characterized by the property

	4528 // that all characters uncanonicalize in the same way, except that

	4529 // each entry in the result is incremented by the distance from the first

	4530 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and

	4531 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].

	4532 // Once we've found the end point we look up its uncanonicalization

	4533 // and produce a range for each element. For instance for [c-f]

	4534 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only

	4535 // add a range if it is not already contained in the input, so [c-f]

	4536 // will be skipped but [C-F] will be added. If this range is not

	4537 // completely contained in a block we do this for all the blocks

	4538 // covered by the range (handling characters that is not in a block

	4539 // as a "singleton block").

	4540 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	4541 intptr_t pos = bottom;

	4542 while (pos <= top) {

	4543 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);

	4544 uint16_t block_end;

	4545 if (length == 0) {

	4546 block_end = pos;

	4547 } else {

	4548 ASSERT(length == 1);

	4549 block_end = range[0];

	4550 }

	4551 intptr_t end = (block_end > top) ? top : block_end;

	4552 length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT

	4553 for (intptr_t i = 0; i < length; i++) {

	4554 uint32_t c = range[i];

	4555 uint16_t range_from = c - (block_end - pos);

	4556 uint16_t range_to = c - (block_end - end);

	4557 if (!(bottom <= range_from && range_to <= top)) {

	4558 ranges->Add(CharacterRange(range_from, range_to));

	4559 }

	4560 }

	4561 pos = end + 1;

	4562 }

	4563 }

	4564 }

	4565

	4566

	4567 bool CharacterRange::IsCanonical(ZoneGrowableArray<CharacterRange>* ranges) {

	4568 ASSERT(ranges != NULL);

	4569 intptr_t n = ranges->length();

	4570 if (n <= 1) return true;

	4571 intptr_t max = ranges->At(0).to();

	4572 for (intptr_t i = 1; i < n; i++) {

	4573 CharacterRange next_range = ranges->At(i);

	4574 if (next_range.from() <= max + 1) return false;

	4575 max = next_range.to();

	4576 }

	4577 return true;

	4578 }

	4579

	4580

	4581 ZoneGrowableArray<CharacterRange>* CharacterSet::ranges() {

	4582 if (ranges_ == NULL) {

	4583 ranges_ = new ZoneGrowableArray<CharacterRange>(2);

	4584 CharacterRange::AddClassEscape(standard_set_type_, ranges_);

	4585 }

	4586 return ranges_;

	4587 }

	4588

	4589

	4590 // Move a number of elements in a zone array to another position

	4591 // in the same array. Handles overlapping source and target areas.

	4592 static void MoveRanges(ZoneGrowableArray<CharacterRange>* list,

	4593 intptr_t from,

	4594 intptr_t to,

	4595 intptr_t count) {

	4596 // Ranges are potentially overlapping.

	4597 if (from < to) {

	4598 for (intptr_t i = count - 1; i >= 0; i--) {

	4599 (*list)[to + i] = list->At(from + i);

	4600 }

	4601 } else {

	4602 for (intptr_t i = 0; i < count; i++) {

	4603 (*list)[to + i] = list->At(from + i);

	4604 }

	4605 }

	4606 }

	4607

	4608

	4609 static intptr_t InsertRangeInCanonicalList(

	4610 ZoneGrowableArray<CharacterRange>* list,

	4611 intptr_t count,

	4612 CharacterRange insert) {

	4613 // Inserts a range into list[0..count[, which must be sorted

	4614 // by from value and non-overlapping and non-adjacent, using at most

	4615 // list[0..count] for the result. Returns the number of resulting

	4616 // canonicalized ranges. Inserting a range may collapse existing ranges into

	4617 // fewer ranges, so the return value can be anything in the range 1..count+1.

	4618 uint16_t from = insert.from();

	4619 uint16_t to = insert.to();

	4620 intptr_t start_pos = 0;

	4621 intptr_t end_pos = count;

	4622 for (intptr_t i = count - 1; i >= 0; i--) {

	4623 CharacterRange current = list->At(i);

	4624 if (current.from() > to + 1) {

	4625 end_pos = i;

	4626 } else if (current.to() + 1 < from) {

	4627 start_pos = i + 1;

	4628 break;

	4629 }

	4630 }

	4631

	4632 // Inserted range overlaps, or is adjacent to, ranges at positions

	4633 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are

	4634 // not affected by the insertion.

	4635 // If start_pos == end_pos, the range must be inserted before start_pos.

	4636 // if start_pos < end_pos, the entire range from start_pos to end_pos

	4637 // must be merged with the insert range.

	4638

	4639 if (start_pos == end_pos) {

	4640 // Insert between existing ranges at position start_pos.

	4641 if (start_pos < count) {

	4642 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);

	4643 }

	4644 (*list)[start_pos] = insert;

	4645 return count + 1;

	4646 }

	4647 if (start_pos + 1 == end_pos) {

	4648 // Replace single existing range at position start_pos.

	4649 CharacterRange to_replace = list->At(start_pos);

	4650 intptr_t new_from = Utils::Minimum(to_replace.from(), from);

	4651 intptr_t new_to = Utils::Maximum(to_replace.to(), to);

	4652 (*list)[start_pos] = CharacterRange(new_from, new_to);

	4653 return count;

	4654 }

	4655 // Replace a number of existing ranges from start_pos to end_pos - 1.

	4656 // Move the remaining ranges down.

	4657

	4658 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);

	4659 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to);

	4660 if (end_pos < count) {

	4661 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);

	4662 }

	4663 (*list)[start_pos] = CharacterRange(new_from, new_to);

	4664 return count - (end_pos - start_pos) + 1;

	4665 }

	4666

	4667

	4668 void CharacterSet::Canonicalize() {

	4669 // Special/default classes are always considered canonical. The result

	4670 // of calling ranges() will be sorted.

	4671 if (ranges_ == NULL) return;

	4672 CharacterRange::Canonicalize(ranges_);

	4673 }

	4674

	4675

	4676 void CharacterRange::Canonicalize(

	4677 ZoneGrowableArray<CharacterRange>* character_ranges) {

	4678 if (character_ranges->length() <= 1) return;

	4679 // Check whether ranges are already canonical (increasing, non-overlapping,

	4680 // non-adjacent).

	4681 intptr_t n = character_ranges->length();

	4682 intptr_t max = character_ranges->At(0).to();

	4683 intptr_t i = 1;

	4684 while (i < n) {

	4685 CharacterRange current = character_ranges->At(i);

	4686 if (current.from() <= max + 1) {

	4687 break;

	4688 }

	4689 max = current.to();

	4690 i++;

	4691 }

	4692 // Canonical until the i'th range. If that's all of them, we are done.

	4693 if (i == n) return;

	4694

	4695 // The ranges at index i and forward are not canonicalized. Make them so by

	4696 // doing the equivalent of insertion sort (inserting each into the previous

	4697 // list, in order).

	4698 // Notice that inserting a range can reduce the number of ranges in the

	4699 // result due to combining of adjacent and overlapping ranges.

	4700 intptr_t read = i; // Range to insert.

	4701 intptr_t num_canonical = i; // Length of canonicalized part of list.

	4702 do {

	4703 num_canonical = InsertRangeInCanonicalList(character_ranges,

	4704 num_canonical,

	4705 character_ranges->At(read));

	4706 read++;

	4707 } while (read < n);

	4708 character_ranges->TruncateTo(num_canonical);

	4709

	4710 ASSERT(CharacterRange::IsCanonical(character_ranges));

	4711 }

	4712

	4713

	4714 void CharacterRange::Negate(ZoneGrowableArray<CharacterRange>* ranges,

	4715 ZoneGrowableArray<CharacterRange>* negated_ranges) {

	4716 ASSERT(CharacterRange::IsCanonical(ranges));

	4717 ASSERT(negated_ranges->length() == 0);

	4718 intptr_t range_count = ranges->length();

	4719 uint16_t from = 0;

	4720 intptr_t i = 0;

	4721 if (range_count > 0 && ranges->At(0).from() == 0) {

	4722 from = ranges->At(0).to();

	4723 i = 1;

	4724 }

	4725 while (i < range_count) {

	4726 CharacterRange range = ranges->At(i);

	4727 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));

	4728 from = range.to();

	4729 i++;

	4730 }

	4731 if (from < Utf16::kMaxCodeUnit) {

	4732 negated_ranges->Add(CharacterRange(from + 1, Utf16::kMaxCodeUnit));

	4733 }

	4734 }

	4735

	4736

	4737 // -------------------------------------------------------------------

	4738 // Splay tree

	4739

	4740

	4741 // Workaround for the fact that ZoneGrowableArray does not have contains().

	4742 static bool ArrayContains(ZoneGrowableArray<unsigned>* array,

	4743 unsigned value) {

	4744 for (intptr_t i = 0; i < array->length(); i++) {

	4745 if (array->At(i) == value) {

	4746 return true;

	4747 }

	4748 }

	4749 return false;

	4750 }

	4751

	4752

	4753 void OutSet::Set(unsigned value, Isolate* isolate) {

	4754 if (value < kFirstLimit) {

	4755 first_ \|= (1 << value);

	4756 } else {

	4757 if (remaining_ == NULL)

	4758 remaining_ = new(isolate) ZoneGrowableArray<unsigned>(1);

	4759

	4760 bool remaining_contains_value = ArrayContains(remaining_, value);

	4761 if (remaining_->is_empty() \|\| !remaining_contains_value) {

	4762 remaining_->Add(value);

	4763 }

	4764 }

	4765 }

	4766

	4767

	4768 bool OutSet::Get(unsigned value) const {

	4769 if (value < kFirstLimit) {

	4770 return (first_ & (1 << value)) != 0;

	4771 } else if (remaining_ == NULL) {

	4772 return false;

	4773 } else {

	4774 return ArrayContains(remaining_, value);

	4775 }

	4776 }

	4777

	4778

	4779 // -------------------------------------------------------------------

	4780 // Analysis

	4781

	4782

	4783 void Analysis::EnsureAnalyzed(RegExpNode* that) {

	4784 if (that->info()->been_analyzed \|\| that->info()->being_analyzed)

	4785 return;

	4786 that->info()->being_analyzed = true;

	4787 that->Accept(this);

	4788 that->info()->being_analyzed = false;

	4789 that->info()->been_analyzed = true;

	4790 }

	4791

	4792

	4793 void Analysis::VisitEnd(EndNode* that) {

	4794 // nothing to do

	4795 }

	4796

	4797

	4798 void TextNode::CalculateOffsets() {

	4799 intptr_t element_count = elements()->length();

	4800 // Set up the offsets of the elements relative to the start. This is a fixed

	4801 // quantity since a TextNode can only contain fixed-width things.

	4802 intptr_t cp_offset = 0;

	4803 for (intptr_t i = 0; i < element_count; i++) {

	4804 TextElement& elm = (*elements())[i];

	4805 elm.set_cp_offset(cp_offset);

	4806 cp_offset += elm.length();

	4807 }

	4808 }

	4809

	4810

	4811 void Analysis::VisitText(TextNode* that) {

	4812 if (ignore_case_) {

	4813 that->MakeCaseIndependent(is_one_byte_);

	4814 }

	4815 EnsureAnalyzed(that->on_success());

	4816 if (!has_failed()) {

	4817 that->CalculateOffsets();

	4818 }

	4819 }

	4820

	4821

	4822 void Analysis::VisitAction(ActionNode* that) {

	4823 RegExpNode* target = that->on_success();

	4824 EnsureAnalyzed(target);

	4825 if (!has_failed()) {

	4826 // If the next node is interested in what it follows then this node

	4827 // has to be interested too so it can pass the information on.

	4828 that->info()->AddFromFollowing(target->info());

	4829 }

	4830 }

	4831

	4832

	4833 void Analysis::VisitChoice(ChoiceNode* that) {

	4834 NodeInfo* info = that->info();

	4835 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {

	4836 RegExpNode* node = (*that->alternatives())[i].node();

	4837 EnsureAnalyzed(node);

	4838 if (has_failed()) return;

	4839 // Anything the following nodes need to know has to be known by

	4840 // this node also, so it can pass it on.

	4841 info->AddFromFollowing(node->info());

	4842 }

	4843 }

	4844

	4845

	4846 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {

	4847 NodeInfo* info = that->info();

	4848 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {

	4849 RegExpNode* node = (*that->alternatives())[i].node();

	4850 if (node != that->loop_node()) {

	4851 EnsureAnalyzed(node);

	4852 if (has_failed()) return;

	4853 info->AddFromFollowing(node->info());

	4854 }

	4855 }

	4856 // Check the loop last since it may need the value of this node

	4857 // to get a correct result.

	4858 EnsureAnalyzed(that->loop_node());

	4859 if (!has_failed()) {

	4860 info->AddFromFollowing(that->loop_node()->info());

	4861 }

	4862 }

	4863

	4864

	4865 void Analysis::VisitBackReference(BackReferenceNode* that) {

	4866 EnsureAnalyzed(that->on_success());

	4867 }

	4868

	4869

	4870 void Analysis::VisitAssertion(AssertionNode* that) {

	4871 EnsureAnalyzed(that->on_success());

	4872 }

	4873

	4874

	4875 void BackReferenceNode::FillInBMInfo(intptr_t offset,

	4876 intptr_t budget,

	4877 BoyerMooreLookahead* bm,

	4878 bool not_at_start) {

	4879 // Working out the set of characters that a backreference can match is too

	4880 // hard, so we just say that any character can match.

	4881 bm->SetRest(offset);

	4882 SaveBMInfo(bm, not_at_start, offset);

	4883 }

	4884

	4885

	4886 COMPILE_ASSERT(BoyerMoorePositionInfo::kMapSize ==

	4887 RegExpMacroAssembler::kTableSize);

	4888

	4889

	4890 void ChoiceNode::FillInBMInfo(intptr_t offset,

	4891 intptr_t budget,

	4892 BoyerMooreLookahead* bm,

	4893 bool not_at_start) {

	4894 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();

	4895 budget = (budget - 1) / alts->length();

	4896 for (intptr_t i = 0; i < alts->length(); i++) {

	4897 GuardedAlternative& alt = (*alts)[i];

	4898 if (alt.guards() != NULL && alt.guards()->length() != 0) {

	4899 bm->SetRest(offset); // Give up trying to fill in info.

	4900 SaveBMInfo(bm, not_at_start, offset);

	4901 return;

	4902 }

	4903 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);

	4904 }

	4905 SaveBMInfo(bm, not_at_start, offset);

	4906 }

	4907

	4908

	4909 void TextNode::FillInBMInfo(intptr_t initial_offset,

	4910 intptr_t budget,

	4911 BoyerMooreLookahead* bm,

	4912 bool not_at_start) {

	4913 if (initial_offset >= bm->length()) return;

	4914 intptr_t offset = initial_offset;

	4915 intptr_t max_char = bm->max_char();

	4916 for (intptr_t i = 0; i < elements()->length(); i++) {

	4917 if (offset >= bm->length()) {

	4918 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	4919 return;

	4920 }

	4921 TextElement text = elements()->At(i);

	4922 if (text.text_type() == TextElement::ATOM) {

	4923 RegExpAtom* atom = text.atom();

	4924 for (intptr_t j = 0; j < atom->length(); j++, offset++) {

	4925 if (offset >= bm->length()) {

	4926 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	4927 return;

	4928 }

	4929 uint16_t character = atom->data()->At(j);

	4930 if (bm->compiler()->ignore_case()) {

	4931 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	4932 intptr_t length = GetCaseIndependentLetters(

	4933 character,

	4934 bm->max_char() == Symbols::kMaxOneCharCodeSymbol,

	4935 chars);

	4936 for (intptr_t j = 0; j < length; j++) {

	4937 bm->Set(offset, chars[j]);

	4938 }

	4939 } else {

	4940 if (character <= max_char) bm->Set(offset, character);

	4941 }

	4942 }

	4943 } else {

	4944 ASSERT(text.text_type() == TextElement::CHAR_CLASS);

	4945 RegExpCharacterClass* char_class = text.char_class();

	4946 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();

	4947 if (char_class->is_negated()) {

	4948 bm->SetAll(offset);

	4949 } else {

	4950 for (intptr_t k = 0; k < ranges->length(); k++) {

	4951 CharacterRange& range = (*ranges)[k];

	4952 if (range.from() > max_char) continue;

	4953 intptr_t to = Utils::Minimum(max_char,

	4954 static_cast<intptr_t>(range.to()));

	4955 bm->SetInterval(offset, Interval(range.from(), to));

	4956 }

	4957 }

	4958 offset++;

	4959 }

	4960 }

	4961 if (offset >= bm->length()) {

	4962 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	4963 return;

	4964 }

	4965 on_success()->FillInBMInfo(offset,

	4966 budget - 1,

	4967 bm,

	4968 true); // Not at start after a text node.

	4969 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	4970 }

	4971

	4972

	4973 RegExpEngine::CompilationResult RegExpEngine::Compile(

	4974 RegExpCompileData* data,

	4975 const ParsedFunction* parsed_function,

	4976 const ZoneGrowableArray<const ICData*>& ic_data_array) {

	4977 Isolate* isolate = Isolate::Current();

	4978

	4979 const Function& function = parsed_function->function();

	4980 const intptr_t specialization_cid = function.regexp_cid();

	4981 const bool is_one_byte = (specialization_cid == kOneByteStringCid \|\|

	4982 specialization_cid == kExternalOneByteStringCid);

	4983 JSRegExp& regexp = JSRegExp::Handle(isolate, function.regexp());

	4984 const String& pattern = String::Handle(isolate, regexp.pattern());

	4985

	4986 ASSERT(!regexp.IsNull());

	4987 ASSERT(!pattern.IsNull());

	4988

	4989 const bool ignore_case = regexp.is_ignore_case();

	4990 const bool is_global = regexp.is_global();

	4991

	4992 RegExpCompiler compiler(data->capture_count, ignore_case, specialization_cid);

	4993

	4994 // TODO(zerny): Frequency sampling is currently disabled because of several

	4995 // issues. We do not want to store subject strings in the regexp object since

	4996 // they might be long and we should not prevent their garbage collection.

	4997 // Passing them to this function explicitly does not help, since we must

	4998 // generate exactly the same IR for both the unoptimizing and optimizing

	4999 // pipelines (otherwise it gets confused when i.e. deopt id's differ).

	5000 // An option would be to store sampling results in the regexp object, but

	5001 // I'm not sure the performance gains are relevant enough.

	5002

	5003 // Wrap the body of the regexp in capture #0.

	5004 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,

	5005 0,

	5006 &compiler,

	5007 compiler.accept());

	5008

	5009 RegExpNode* node = captured_body;

	5010 bool is_end_anchored = data->tree->IsAnchoredAtEnd();

	5011 bool is_start_anchored = data->tree->IsAnchoredAtStart();

	5012 intptr_t max_length = data->tree->max_match();

	5013 if (!is_start_anchored) {

	5014 // Add a .*? at the beginning, outside the body capture, unless

	5015 // this expression is anchored at the beginning.

	5016 RegExpNode* loop_node =

	5017 RegExpQuantifier::ToNode(0,

	5018 RegExpTree::kInfinity,

	5019 false,

	5020 new(isolate) RegExpCharacterClass('*'),

	5021 &compiler,

	5022 captured_body,

	5023 data->contains_anchor);

	5024

	5025 if (data->contains_anchor) {

	5026 // Unroll loop once, to take care of the case that might start

	5027 // at the start of input.

	5028 ChoiceNode* first_step_node = new(isolate) ChoiceNode(2, isolate);

	5029 first_step_node->AddAlternative(GuardedAlternative(captured_body));

	5030 first_step_node->AddAlternative(GuardedAlternative(

	5031 new(isolate) TextNode(

	5032 new(isolate) RegExpCharacterClass('*'), loop_node)));

	5033 node = first_step_node;

	5034 } else {

	5035 node = loop_node;

	5036 }

	5037 }

	5038 if (is_one_byte) {

	5039 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);

	5040 // Do it again to propagate the new nodes to places where they were not

	5041 // put because they had not been calculated yet.

	5042 if (node != NULL) {

	5043 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);

	5044 }

	5045 }

	5046

	5047 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate);

	5048 data->node = node;

	5049 Analysis analysis(ignore_case, is_one_byte);

	5050 analysis.EnsureAnalyzed(node);

	5051 if (analysis.has_failed()) {

	5052 const char* error_message = analysis.error_message();

	5053 return CompilationResult(error_message);

	5054 }

	5055

	5056 // Native regexp implementation.

	5057

	5058 IRRegExpMacroAssembler* macro_assembler =

	5059 new(isolate) IRRegExpMacroAssembler(specialization_cid,

	5060 data->capture_count,

	5061 parsed_function,

	5062 ic_data_array,

	5063 isolate);

	5064

	5065 // Inserted here, instead of in Assembler, because it depends on information

	5066 // in the AST that isn't replicated in the Node structure.

	5067 static const intptr_t kMaxBacksearchLimit = 1024;

	5068 if (is_end_anchored &&

	5069 !is_start_anchored &&

	5070 max_length < kMaxBacksearchLimit) {

	5071 macro_assembler->SetCurrentPositionFromEnd(max_length);

	5072 }

	5073

	5074 if (is_global) {

	5075 macro_assembler->set_global_mode(

	5076 (data->tree->min_match() > 0)

	5077 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK

	5078 : RegExpMacroAssembler::GLOBAL);

	5079 }

	5080

	5081 RegExpEngine::CompilationResult result =

	5082 compiler.Assemble(macro_assembler,

	5083 node,

	5084 data->capture_count,

	5085 pattern);

	5086

	5087 if (FLAG_trace_irregexp) {

	5088 macro_assembler->PrintBlocks();

	5089 }

	5090

	5091 return result;

	5092 }

	5093

	5094

	5095 static void CreateSpecializedFunction(Isolate* isolate,

	5096 const JSRegExp& regexp,

	5097 intptr_t specialization_cid,

	5098 const Object& owner) {

	5099 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;

	5100

	5101 Function& fn = Function::Handle(isolate,

	5102 Function::New(String::Handle(isolate, Symbols::New("RegExp")),

	5103 RawFunction::kIrregexpFunction,

	5104 true, // Static.

	5105 false, // Not const.

	5106 false, // Not abstract.

	5107 false, // Not external.

	5108 false, // Not native.

	5109 owner,

	5110 0)); // Requires a non-negative token position.

	5111

	5112 // TODO(zerny): Share these arrays between all irregexp functions.

	5113 fn.set_num_fixed_parameters(kParamCount);

	5114 fn.set_parameter_types(Array::Handle(isolate, Array::New(kParamCount,

	5115 Heap::kOld)));

	5116 fn.set_parameter_names(Array::Handle(isolate, Array::New(kParamCount,

	5117 Heap::kOld)));

	5118 fn.SetParameterTypeAt(0, Type::Handle(isolate, Type::DynamicType()));

	5119 fn.SetParameterNameAt(0, Symbols::string_param_());

	5120 fn.SetParameterTypeAt(1, Type::Handle(isolate, Type::DynamicType()));

	5121 fn.SetParameterNameAt(1, Symbols::start_index_param_());

	5122 fn.set_result_type(Type::Handle(isolate, Type::ArrayType()));

	5123

	5124 // Cache the result.

	5125 regexp.set_function(specialization_cid, fn);

	5126

	5127 fn.set_regexp(regexp);

	5128 fn.set_regexp_cid(specialization_cid);

	5129

	5130 // The function is compiled lazily during the first call.

	5131 }

	5132

	5133

	5134 RawJSRegExp* RegExpEngine::CreateJSRegExp(Isolate* isolate,

	5135 const String& pattern,

	5136 bool multi_line,

	5137 bool ignore_case) {

	5138 const JSRegExp& regexp = JSRegExp::Handle(JSRegExp::New(0));

	5139

	5140 regexp.set_pattern(pattern);

	5141

	5142 if (multi_line) {

	5143 regexp.set_is_multi_line();

	5144 }

	5145 if (ignore_case) {

	5146 regexp.set_is_ignore_case();

	5147 }

	5148

	5149 // TODO(zerny): We might want to use normal string searching algorithms

	5150 // for simple patterns.

	5151 regexp.set_is_complex();

	5152 regexp.set_is_global(); // All dart regexps are global.

	5153

	5154 const Library& lib = Library::Handle(isolate, Library::CoreLibrary());

	5155 const Class& owner = Class::Handle(isolate,

	5156 lib.LookupClass(String::Handle(isolate, Symbols::New("RegExp"))));

	5157

	5158 CreateSpecializedFunction(isolate, regexp, kOneByteStringCid, owner);

	5159 CreateSpecializedFunction(isolate, regexp, kTwoByteStringCid, owner);

	5160 CreateSpecializedFunction(isolate, regexp, kExternalOneByteStringCid, owner);

	5161 CreateSpecializedFunction(isolate, regexp, kExternalTwoByteStringCid, owner);

	5162

	5163 return regexp.raw();

	5164 }

	5165

	5166

	5167 } // namespace dart

OLD	NEW

« runtime/vm/object.cc ('K') | « runtime/vm/regexp.h ('k') | runtime/vm/regexp_assembler.h » ('j') | no next file with comments »