runtime/vm/regexp.cc - Issue 539153002: Port and integrate the irregexp engine from V8

Side by Side Diff: runtime/vm/regexp.cc

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Port remaining V8 regexp tests and fix exposed bugs. Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/regexp.h"	5 #include "vm/regexp.h"

6	6

	7 #include "vm/dart_entry.h"

	8 #include "vm/regexp_assembler.h"

7 #include "vm/regexp_ast.h"	9 #include "vm/regexp_ast.h"

	10 #include "vm/unibrow-inl.h"

8 #include "vm/unicode.h"	11 #include "vm/unicode.h"

9 #include "vm/symbols.h"	12 #include "vm/symbols.h"

10	13

11 #define I isolate()	14 #define I isolate()

12 #define CI compiler->isolate()	15 #define CI compiler->isolate()

13	16

14 namespace dart {	17 namespace dart {

15	18

	19 DECLARE_FLAG(bool, trace_irregexp);

16	20

17 #define DEFINE_ACCEPT(Type) \	21 #define DEFINE_ACCEPT(Type) \

18 void Type##Node::Accept(NodeVisitor* visitor) { \	22 void Type##Node::Accept(NodeVisitor* visitor) { \

19 visitor->Visit##Type(this); \	23 visitor->Visit##Type(this); \

20 }	24 }

21 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)	25 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)

22 #undef DEFINE_ACCEPT	26 #undef DEFINE_ACCEPT

23	27

24	28

25 // Default to generating optimized regexp code.	29 // Default to generating optimized regexp code.

(...skipping 12 matching lines...) Expand all Loading...
38 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,	42 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,

39 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,	43 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,

40 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,	44 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,

41 0xFEFF, 0xFF00, 0x10000 };	45 0xFEFF, 0xFF00, 0x10000 };

42 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);	46 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);

43 static const intptr_t kWordRanges[] = {	47 static const intptr_t kWordRanges[] = {

44 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };	48 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };

45 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);	49 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);

46 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 };	50 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 };

47 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);	51 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);

	52 static const intptr_t kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };

	53 static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);

48 static const intptr_t kLineTerminatorRanges[] = {	54 static const intptr_t kLineTerminatorRanges[] = {

49 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 };	55 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 };

50 static const intptr_t kLineTerminatorRangeCount =	56 static const intptr_t kLineTerminatorRangeCount =

51 ARRAY_SIZE(kLineTerminatorRanges);	57 ARRAY_SIZE(kLineTerminatorRanges);

52	58

53	59

54 static inline void PrintUtf16(uint16_t c) {	60 static inline void PrintUtf16(uint16_t c) {

55 const char* format = (0x20 <= c && c <= 0x7F) ?	61 const char* format = (0x20 <= c && c <= 0x7F) ?

56 "%c" : (c <= 0xff) ? "\\x%02x" : "\\u%04x";	62 "%c" : (c <= 0xff) ? "\\x%02x" : "\\u%04x";

57 OS::Print(format, c);	63 OS::Print(format, c);

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
98 info->visited = true;	104 info->visited = true;

99 }	105 }

100 ~VisitMarker() {	106 ~VisitMarker() {

101 info_->visited = false;	107 info_->visited = false;

102 }	108 }

103 private:	109 private:

104 NodeInfo* info_;	110 NodeInfo* info_;

105 };	111 };

106	112

107	113

	114 class FrequencyCollator : public ValueObject {

	115 public:

	116 FrequencyCollator() : total_samples_(0) {

	117 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {

	118 frequencies_[i] = CharacterFrequency(i);

	119 }

	120 }

	121

	122 void CountCharacter(intptr_t character) {

	123 intptr_t index = (character & RegExpMacroAssembler::kTableMask);

	124 frequencies_[index].Increment();

	125 total_samples_++;

	126 }

	127

	128 // Does not measure in percent, but rather per-128 (the table size from the

	129 // regexp macro assembler).

	130 intptr_t Frequency(intptr_t in_character) {

	131 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);

	132 if (total_samples_ < 1) return 1; // Division by zero.

	133 intptr_t freq_in_per128 =

	134 (frequencies_[in_character].counter() * 128) / total_samples_;

	135 return freq_in_per128;

	136 }

	137

	138 private:

	139 class CharacterFrequency {

	140 public:

	141 CharacterFrequency() : counter_(0), character_(-1) { }

	142 explicit CharacterFrequency(intptr_t character)

	143 : counter_(0), character_(character) { }

	144

	145 void Increment() { counter_++; }

	146 intptr_t counter() { return counter_; }

	147 intptr_t character() { return character_; }

	148

	149 private:

	150 intptr_t counter_;

	151 intptr_t character_;

	152 };

	153

	154

	155 private:

	156 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];

	157 intptr_t total_samples_;

	158 };

	159

	160

108 class RegExpCompiler {	161 class RegExpCompiler {

109 public:	162 public:

110 RegExpCompiler(intptr_t capture_count, bool ignore_case, bool is_ascii);	163 RegExpCompiler(intptr_t capture_count,

	164 bool ignore_case,

	165 intptr_t specialization_cid);

111	166

112 intptr_t AllocateRegister() {	167 intptr_t AllocateRegister() {

113 return next_register_++;	168 return next_register_++;

114 }	169 }

115	170

116 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,	171 RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler,

117 RegExpNode* start,	172 RegExpNode* start,

118 intptr_t capture_count,	173 intptr_t capture_count,

119 const String& pattern);	174 const String& pattern);

120	175

	176 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }

	177

121 static const intptr_t kImplementationOffset = 0;	178 static const intptr_t kImplementationOffset = 0;

122 static const intptr_t kNumberOfRegistersOffset = 0;	179 static const intptr_t kNumberOfRegistersOffset = 0;

123 static const intptr_t kCodeOffset = 1;	180 static const intptr_t kCodeOffset = 1;

124	181

125 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }	182 IRRegExpMacroAssembler* macro_assembler() { return macro_assembler_; }

126 EndNode* accept() { return accept_; }	183 EndNode* accept() { return accept_; }

127	184

128 static const intptr_t kMaxRecursion = 100;	185 static const intptr_t kMaxRecursion = 100;

129 inline intptr_t recursion_depth() { return recursion_depth_; }	186 inline intptr_t recursion_depth() { return recursion_depth_; }

130 inline void IncrementRecursionDepth() { recursion_depth_++; }	187 inline void IncrementRecursionDepth() { recursion_depth_++; }

131 inline void DecrementRecursionDepth() { recursion_depth_--; }	188 inline void DecrementRecursionDepth() { recursion_depth_--; }

132	189

133 void SetRegExpTooBig() { reg_exp_too_big_ = true; }	190 void SetRegExpTooBig() { reg_exp_too_big_ = true; }

134	191

135 inline bool ignore_case() { return ignore_case_; }	192 inline bool ignore_case() { return ignore_case_; }

136 inline bool ascii() { return ascii_; }	193 inline bool ascii() const {

	194 return (specialization_cid_ == kOneByteStringCid \|\|

	195 specialization_cid_ == kExternalOneByteStringCid);

	196 }

	197 inline intptr_t specialization_cid() { return specialization_cid_; }

	198 FrequencyCollator* frequency_collator() { return &frequency_collator_; }

137	199

138 intptr_t current_expansion_factor() { return current_expansion_factor_; }	200 intptr_t current_expansion_factor() { return current_expansion_factor_; }

139 void set_current_expansion_factor(intptr_t value) {	201 void set_current_expansion_factor(intptr_t value) {

140 current_expansion_factor_ = value;	202 current_expansion_factor_ = value;

141 }	203 }

142	204

143 Isolate* isolate() const { return isolate_; }	205 Isolate* isolate() const { return isolate_; }

144	206

145 static const intptr_t kNoRegister = -1;	207 static const intptr_t kNoRegister = -1;

146	208

147 private:	209 private:

148 EndNode* accept_;	210 EndNode* accept_;

149 intptr_t next_register_;	211 intptr_t next_register_;

	212 ZoneGrowableArray<RegExpNode> work_list_;

150 intptr_t recursion_depth_;	213 intptr_t recursion_depth_;

151 RegExpMacroAssembler* macro_assembler_;	214 IRRegExpMacroAssembler* macro_assembler_;

152 bool ignore_case_;	215 bool ignore_case_;

153 bool ascii_;	216 intptr_t specialization_cid_;

154 bool reg_exp_too_big_;	217 bool reg_exp_too_big_;

155 intptr_t current_expansion_factor_;	218 intptr_t current_expansion_factor_;

	219 FrequencyCollator frequency_collator_;

156 Isolate* isolate_;	220 Isolate* isolate_;

157 };	221 };

158	222

159	223

	224 class RecursionCheck : public ValueObject {

	225 public:

	226 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {

	227 compiler->IncrementRecursionDepth();

	228 }

	229 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }

	230 private:

	231 RegExpCompiler* compiler_;

	232 };

	233

	234

160 // Scoped object to keep track of how much we unroll quantifier loops in the	235 // Scoped object to keep track of how much we unroll quantifier loops in the

161 // regexp graph generator.	236 // regexp graph generator.

162 class RegExpExpansionLimiter {	237 class RegExpExpansionLimiter : public ValueObject {

163 public:	238 public:

164 static const intptr_t kMaxExpansionFactor = 6;	239 static const intptr_t kMaxExpansionFactor = 6;

165 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)	240 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)

166 : compiler_(compiler),	241 : compiler_(compiler),

167 saved_expansion_factor_(compiler->current_expansion_factor()),	242 saved_expansion_factor_(compiler->current_expansion_factor()),

168 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {	243 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {

169 ASSERT(factor > 0);	244 ASSERT(factor > 0);

170 if (ok_to_expand_) {	245 if (ok_to_expand_) {

171 if (factor > kMaxExpansionFactor) {	246 if (factor > kMaxExpansionFactor) {

172 // Avoid integer overflow of the current expansion factor.	247 // Avoid integer overflow of the current expansion factor.

(...skipping 531 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
704 // we don't need to check it.	779 // we don't need to check it.

705 if (neg_replacement == NULL) return set_replacement(replacement);	780 if (neg_replacement == NULL) return set_replacement(replacement);

706 (*alternatives_)[0].set_node(neg_replacement);	781 (*alternatives_)[0].set_node(neg_replacement);

707 return set_replacement(this);	782 return set_replacement(this);

708 }	783 }

709	784

710	785

711 // Code emission ---------------------------------------------------------------	786 // Code emission ---------------------------------------------------------------

712	787

713	788

714 #define DEFINE_EMIT(Type) \	789 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,

715 void Type##Node::Emit(RegExpCompiler* compiler, Trace* trace) { \	790 Guard* guard,

716 UNIMPLEMENTED(); \	791 Trace* trace) {

717 }	792 switch (guard->op()) {

718 FOR_EACH_NODE_TYPE(DEFINE_EMIT)	793 case Guard::LT:

719 DEFINE_EMIT(LoopChoice)	794 ASSERT(!trace->mentions_reg(guard->reg()));

720 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler,	795 macro_assembler->IfRegisterGE(guard->reg(),

721 Trace* trace) {	796 guard->value(),

	797 trace->backtrack());

	798 break;

	799 case Guard::GEQ:

	800 ASSERT(!trace->mentions_reg(guard->reg()));

	801 macro_assembler->IfRegisterLT(guard->reg(),

	802 guard->value(),

	803 trace->backtrack());

	804 break;

	805 }

	806 }

	807

	808

	809 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {

	810 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	811

	812 // Omit flushing the trace. We discard the entire stack frame anyway.

	813

	814 if (!label()->IsBound()) {

	815 // We are completely independent of the trace, since we ignore it,

	816 // so this code can be used as the generic version.

	817 assembler->BindBlock(label());

	818 }

	819

	820 // Throw away everything on the backtrack stack since the start

	821 // of the negative submatch and restore the character position.

	822 assembler->ReadCurrentPositionFromRegister(current_position_register_);

	823 assembler->ReadStackPointerFromRegister(stack_pointer_register_);

	824 if (clear_capture_count_ > 0) {

	825 // Clear any captures that might have been performed during the success

	826 // of the body of the negative look-ahead.

	827 intptr_t clear_capture_end =

	828 clear_capture_start_ + clear_capture_count_ - 1;

	829 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);

	830 }

	831 // Now that we have unwound the stack we find at the top of the stack the

	832 // backtrack that the BeginSubmatch node got.

	833 assembler->Backtrack();

	834 }

	835

	836

	837 bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {

	838 ASSERT(*cp_offset == 0);

	839 for (DeferredAction* action = actions_;

	840 action != NULL;

	841 action = action->next()) {

	842 if (action->Mentions(reg)) {

	843 if (action->action_type() == ActionNode::STORE_POSITION) {

	844 cp_offset = static_cast<DeferredCapture>(action)->cp_offset();

	845 return true;

	846 } else {

	847 return false;

	848 }

	849 }

	850 }

	851 return false;

	852 }

	853

	854

	855 // This is called as we come into a loop choice node and some other tricky

	856 // nodes. It normalizes the state of the code generator to ensure we can

	857 // generate generic code.

	858 intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers,

	859 Isolate* isolate) {

	860 intptr_t max_register = RegExpCompiler::kNoRegister;

	861 for (DeferredAction* action = actions_;

	862 action != NULL;

	863 action = action->next()) {

	864 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {

	865 Interval range = static_cast<DeferredClearCaptures*>(action)->range();

	866 for (intptr_t i = range.from(); i <= range.to(); i++)

	867 affected_registers->Set(i, isolate);

	868 if (range.to() > max_register) max_register = range.to();

	869 } else {

	870 affected_registers->Set(action->reg(), isolate);

	871 if (action->reg() > max_register) max_register = action->reg();

	872 }

	873 }

	874 return max_register;

	875 }

	876

	877

	878 bool Trace::DeferredAction::Mentions(intptr_t that) {

	879 if (action_type() == ActionNode::CLEAR_CAPTURES) {

	880 Interval range = static_cast<DeferredClearCaptures*>(this)->range();

	881 return range.Contains(that);

	882 } else {

	883 return reg() == that;

	884 }

	885 }

	886

	887

	888 bool Trace::mentions_reg(intptr_t reg) {

	889 for (DeferredAction* action = actions_;

	890 action != NULL;

	891 action = action->next()) {

	892 if (action->Mentions(reg))

	893 return true;

	894 }

	895 return false;

	896 }

	897

	898

	899 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,

	900 intptr_t max_register,

	901 const OutSet& registers_to_pop,

	902 const OutSet& registers_to_clear) {

	903 for (intptr_t reg = max_register; reg >= 0; reg--) {

	904 if (registers_to_pop.Get(reg)) {

	905 assembler->PopRegister(reg);

	906 } else if (registers_to_clear.Get(reg)) {

	907 intptr_t clear_to = reg;

	908 while (reg > 0 && registers_to_clear.Get(reg - 1)) {

	909 reg--;

	910 }

	911 assembler->ClearRegisters(reg, clear_to);

	912 }

	913 }

	914 }

	915

	916

	917 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,

	918 intptr_t max_register,

	919 const OutSet& affected_registers,

	920 OutSet* registers_to_pop,

	921 OutSet* registers_to_clear,

	922 Isolate* isolate) {

	923 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.

	924 const intptr_t push_limit = (assembler->stack_limit_slack() + 1) / 2;

	925

	926 // Count pushes performed to force a stack limit check occasionally.

	927 intptr_t pushes = 0;

	928

	929 for (intptr_t reg = 0; reg <= max_register; reg++) {

	930 if (!affected_registers.Get(reg)) {

	931 continue;

	932 }

	933

	934 // The chronologically first deferred action in the trace

	935 // is used to infer the action needed to restore a register

	936 // to its previous state (or not, if it's safe to ignore it).

	937 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };

	938 DeferredActionUndoType undo_action = IGNORE;

	939

	940 intptr_t value = 0;

	941 bool absolute = false;

	942 bool clear = false;

	943 intptr_t store_position = -1;

	944 // This is a little tricky because we are scanning the actions in reverse

	945 // historical order (newest first).

	946 for (DeferredAction* action = actions_;

	947 action != NULL;

	948 action = action->next()) {

	949 if (action->Mentions(reg)) {

	950 switch (action->action_type()) {

	951 case ActionNode::SET_REGISTER: {

	952 Trace::DeferredSetRegister* psr =

	953 static_cast<Trace::DeferredSetRegister*>(action);

	954 if (!absolute) {

	955 value += psr->value();

	956 absolute = true;

	957 }

	958 // SET_REGISTER is currently only used for newly introduced loop

	959 // counters. They can have a significant previous value if they

	960 // occour in a loop. TODO(lrn): Propagate this information, so

	961 // we can set undo_action to IGNORE if we know there is no value to

	962 // restore.

	963 undo_action = RESTORE;

	964 ASSERT(store_position == -1);

	965 ASSERT(!clear);

	966 break;

	967 }

	968 case ActionNode::INCREMENT_REGISTER:

	969 if (!absolute) {

	970 value++;

	971 }

	972 ASSERT(store_position == -1);

	973 ASSERT(!clear);

	974 undo_action = RESTORE;

	975 break;

	976 case ActionNode::STORE_POSITION: {

	977 Trace::DeferredCapture* pc =

	978 static_cast<Trace::DeferredCapture*>(action);

	979 if (!clear && store_position == -1) {

	980 store_position = pc->cp_offset();

	981 }

	982

	983 // For captures we know that stores and clears alternate.

	984 // Other register, are never cleared, and if the occur

	985 // inside a loop, they might be assigned more than once.

	986 if (reg <= 1) {

	987 // Registers zero and one, aka "capture zero", is

	988 // always set correctly if we succeed. There is no

	989 // need to undo a setting on backtrack, because we

	990 // will set it again or fail.

	991 undo_action = IGNORE;

	992 } else {

	993 undo_action = pc->is_capture() ? CLEAR : RESTORE;

	994 }

	995 ASSERT(!absolute);

	996 ASSERT(value == 0);

	997 break;

	998 }

	999 case ActionNode::CLEAR_CAPTURES: {

	1000 // Since we're scanning in reverse order, if we've already

	1001 // set the position we have to ignore historically earlier

	1002 // clearing operations.

	1003 if (store_position == -1) {

	1004 clear = true;

	1005 }

	1006 undo_action = RESTORE;

	1007 ASSERT(!absolute);

	1008 ASSERT(value == 0);

	1009 break;

	1010 }

	1011 default:

	1012 UNREACHABLE();

	1013 break;

	1014 }

	1015 }

	1016 }

	1017 // Prepare for the undo-action (e.g., push if it's going to be popped).

	1018 if (undo_action == RESTORE) {

	1019 pushes++;

	1020 RegExpMacroAssembler::StackCheckFlag stack_check =

	1021 RegExpMacroAssembler::kNoStackLimitCheck;

	1022 if (pushes == push_limit) {

	1023 stack_check = RegExpMacroAssembler::kCheckStackLimit;

	1024 pushes = 0;

	1025 }

	1026

	1027 assembler->PushRegister(reg, stack_check);

	1028 registers_to_pop->Set(reg, isolate);

	1029 } else if (undo_action == CLEAR) {

	1030 registers_to_clear->Set(reg, isolate);

	1031 }

	1032 // Perform the chronologically last action (or accumulated increment)

	1033 // for the register.

	1034 if (store_position != -1) {

	1035 assembler->WriteCurrentPositionToRegister(reg, store_position);

	1036 } else if (clear) {

	1037 assembler->ClearRegisters(reg, reg);

	1038 } else if (absolute) {

	1039 assembler->SetRegister(reg, value);

	1040 } else if (value != 0) {

	1041 assembler->AdvanceRegister(reg, value);

	1042 }

	1043 }

	1044 }

	1045

	1046

	1047 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {

	1048 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1049

	1050 ASSERT(!is_trivial());

	1051

	1052 if (actions_ == NULL && backtrack() == NULL) {

	1053 // Here we just have some deferred cp advances to fix and we are back to

	1054 // a normal situation. We may also have to forget some information gained

	1055 // through a quick check that was already performed.

	1056 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);

	1057 // Create a new trivial state and generate the node with that.

	1058 Trace new_state;

	1059 successor->Emit(compiler, &new_state);

	1060 return;

	1061 }

	1062

	1063 // Generate deferred actions here along with code to undo them again.

	1064 OutSet affected_registers;

	1065

	1066 if (backtrack() != NULL) {

	1067 // Here we have a concrete backtrack location. These are set up by choice

	1068 // nodes and so they indicate that we have a deferred save of the current

	1069 // position which we may need to emit here.

	1070 assembler->PushCurrentPosition();

	1071 }

	1072

	1073 intptr_t max_register = FindAffectedRegisters(&affected_registers, CI);

	1074 OutSet registers_to_pop;

	1075 OutSet registers_to_clear;

	1076 PerformDeferredActions(assembler,

	1077 max_register,

	1078 affected_registers,

	1079 &registers_to_pop,

	1080 &registers_to_clear,

	1081 CI);

	1082 if (cp_offset_ != 0) {

	1083 assembler->AdvanceCurrentPosition(cp_offset_);

	1084 }

	1085

	1086 // Create a new trivial state and generate the node with that.

	1087 BlockLabel undo;

	1088 assembler->PushBacktrack(&undo);

	1089 Trace new_state;

	1090 successor->Emit(compiler, &new_state);

	1091

	1092 // On backtrack we need to restore state.

	1093 assembler->BindBlock(&undo);

	1094 RestoreAffectedRegisters(assembler,

	1095 max_register,

	1096 registers_to_pop,

	1097 registers_to_clear);

	1098 if (backtrack() == NULL) {

	1099 assembler->Backtrack();

	1100 } else {

	1101 assembler->PopCurrentPosition();

	1102 assembler->GoTo(backtrack());

	1103 }

	1104 }

	1105

	1106

	1107 void Trace::InvalidateCurrentCharacter() {

	1108 characters_preloaded_ = 0;

	1109 }

	1110

	1111

	1112 void Trace::AdvanceCurrentPositionInTrace(intptr_t by,

	1113 RegExpCompiler* compiler) {

	1114 ASSERT(by > 0);

	1115 // We don't have an instruction for shifting the current character register

	1116 // down or for using a shifted value for anything so lets just forget that

	1117 // we preloaded any characters into it.

	1118 characters_preloaded_ = 0;

	1119 // Adjust the offsets of the quick check performed information. This

	1120 // information is used to find out what we already determined about the

	1121 // characters by means of mask and compare.

	1122 quick_check_performed_.Advance(by, compiler->ascii());

	1123 cp_offset_ += by;

	1124 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {

	1125 compiler->SetRegExpTooBig();

	1126 cp_offset_ = 0;

	1127 }

	1128 bound_checked_up_to_ = Utils::Maximum(static_cast<intptr_t>(0),

	1129 bound_checked_up_to_ - by);

	1130 }

	1131

	1132

	1133 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1134 if (!trace->is_trivial()) {

	1135 trace->Flush(compiler, this);

	1136 return;

	1137 }

	1138 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1139 if (!label()->IsBound()) {

	1140 assembler->BindBlock(label());

	1141 }

	1142 switch (action_) {

	1143 case ACCEPT:

	1144 assembler->Succeed();

	1145 return;

	1146 case BACKTRACK:

	1147 assembler->GoTo(trace->backtrack());

	1148 return;

	1149 case NEGATIVE_SUBMATCH_SUCCESS:

	1150 // This case is handled in a different virtual method.

	1151 UNREACHABLE();

	1152 }

722 UNIMPLEMENTED();	1153 UNIMPLEMENTED();

723 }	1154 }

724 #undef DEFINE_EMIT	1155

725	1156

726	1157 bool QuickCheckDetails::Rationalize(bool asc) {

727 #define DEFINE_BMINFO(Type) \	1158 bool found_useful_op = false;

728 void Type##Node::FillInBMInfo(intptr_t initial_offset, \	1159 uint32_t char_mask;

729 intptr_t budget, \	1160 if (asc) {

730 BoyerMooreLookahead* bm, \	1161 char_mask = Symbols::kMaxOneCharCodeSymbol;

731 bool not_at_start) { \	1162 } else {

732 UNIMPLEMENTED(); \	1163 char_mask = Utf16::kMaxCodeUnit;

733 }	1164 }

734 FOR_EACH_NODE_TYPE(DEFINE_BMINFO)	1165 mask_ = 0;

735 DEFINE_BMINFO(LoopChoice)	1166 value_ = 0;

736 #undef DEFINE_BMINFO	1167 intptr_t char_shift = 0;

	1168 for (intptr_t i = 0; i < characters_; i++) {

	1169 Position* pos = &positions_[i];

	1170 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {

	1171 found_useful_op = true;

	1172 }

	1173 mask_ \|= (pos->mask & char_mask) << char_shift;

	1174 value_ \|= (pos->value & char_mask) << char_shift;

	1175 char_shift += asc ? 8 : 16;

	1176 }

	1177 return found_useful_op;

	1178 }

	1179

	1180

	1181 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,

	1182 Trace* trace,

	1183 bool preload_has_checked_bounds,

	1184 BlockLabel* on_possible_success,

	1185 QuickCheckDetails* details,

	1186 bool fall_through_on_failure) {

	1187 if (details->characters() == 0) return false;

	1188 GetQuickCheckDetails(

	1189 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);

	1190 if (details->cannot_match()) return false;

	1191 if (!details->Rationalize(compiler->ascii())) return false;

	1192 ASSERT(details->characters() == 1 \|\|

	1193 compiler->macro_assembler()->CanReadUnaligned());

	1194 uint32_t mask = details->mask();

	1195 uint32_t value = details->value();

	1196

	1197 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1198

	1199 if (trace->characters_preloaded() != details->characters()) {

	1200 assembler->LoadCurrentCharacter(trace->cp_offset(),

	1201 trace->backtrack(),

	1202 !preload_has_checked_bounds,

	1203 details->characters());

	1204 }

	1205

	1206

	1207 bool need_mask = true;

	1208

	1209 if (details->characters() == 1) {

	1210 // If number of characters preloaded is 1 then we used a byte or 16 bit

	1211 // load so the value is already masked down.

	1212 uint32_t char_mask;

	1213 if (compiler->ascii()) {

	1214 char_mask = Symbols::kMaxOneCharCodeSymbol;

	1215 } else {

	1216 char_mask = Utf16::kMaxCodeUnit;

	1217 }

	1218 if ((mask & char_mask) == char_mask) need_mask = false;

	1219 mask &= char_mask;

	1220 } else {

	1221 // For 2-character preloads in ASCII mode or 1-character preloads in

	1222 // TWO_BYTE mode we also use a 16 bit load with zero extend.

	1223 if (details->characters() == 2 && compiler->ascii()) {

	1224 if ((mask & 0xffff) == 0xffff) need_mask = false;

	1225 } else if (details->characters() == 1 && !compiler->ascii()) {

	1226 if ((mask & 0xffff) == 0xffff) need_mask = false;

	1227 } else {

	1228 if (mask == 0xffffffff) need_mask = false;

	1229 }

	1230 }

	1231

	1232 if (fall_through_on_failure) {

	1233 if (need_mask) {

	1234 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);

	1235 } else {

	1236 assembler->CheckCharacter(value, on_possible_success);

	1237 }

	1238 } else {

	1239 if (need_mask) {

	1240 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());

	1241 } else {

	1242 assembler->CheckNotCharacter(value, trace->backtrack());

	1243 }

	1244 }

	1245 return true;

	1246 }

	1247

	1248

	1249 // Emit the code to check for a ^ in multiline mode (1-character lookbehind

	1250 // that matches newline or the start of input).

	1251 static void EmitHat(RegExpCompiler* compiler,

	1252 RegExpNode* on_success,

	1253 Trace* trace) {

	1254 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1255 // We will be loading the previous character into the current character

	1256 // register.

	1257 Trace new_trace(*trace);

	1258 new_trace.InvalidateCurrentCharacter();

	1259

	1260 BlockLabel ok;

	1261 if (new_trace.cp_offset() == 0) {

	1262 // The start of input counts as a newline in this context, so skip to

	1263 // ok if we are at the start.

	1264 assembler->CheckAtStart(&ok);

	1265 }

	1266 // We already checked that we are not at the start of input so it must be

	1267 // OK to load the previous character.

	1268 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,

	1269 new_trace.backtrack(),

	1270 false);

	1271 if (!assembler->CheckSpecialCharacterClass('n',

	1272 new_trace.backtrack())) {

	1273 // Newline means \n, \r, 0x2028 or 0x2029.

	1274 if (!compiler->ascii()) {

	1275 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);

	1276 }

	1277 assembler->CheckCharacter('\n', &ok);

	1278 assembler->CheckNotCharacter('\r', new_trace.backtrack());

	1279 }

	1280 assembler->BindBlock(&ok);

	1281 on_success->Emit(compiler, &new_trace);

	1282 }

	1283

	1284

	1285 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1286 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1287 switch (assertion_type_) {

	1288 case AT_END: {

	1289 BlockLabel ok;

	1290 assembler->CheckPosition(trace->cp_offset(), &ok);

	1291 assembler->GoTo(trace->backtrack());

	1292 assembler->BindBlock(&ok);

	1293 break;

	1294 }

	1295 case AT_START: {

	1296 if (trace->at_start() == Trace::FALSE_VALUE) {

	1297 assembler->GoTo(trace->backtrack());

	1298 return;

	1299 }

	1300 if (trace->at_start() == Trace::UNKNOWN) {

	1301 assembler->CheckNotAtStart(trace->backtrack());

	1302 Trace at_start_trace = *trace;

	1303 at_start_trace.set_at_start(true);

	1304 on_success()->Emit(compiler, &at_start_trace);

	1305 return;

	1306 }

	1307 }

	1308 break;

	1309 case AFTER_NEWLINE:

	1310 EmitHat(compiler, on_success(), trace);

	1311 return;

	1312 case AT_BOUNDARY:

	1313 case AT_NON_BOUNDARY: {

	1314 EmitBoundaryCheck(compiler, trace);

	1315 return;

	1316 }

	1317 }

	1318 on_success()->Emit(compiler, trace);

	1319 }

	1320

	1321

	1322 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,

	1323 Trace* trace) {

	1324 // If we are generating a greedy loop then don't stop and don't reuse code.

	1325 if (trace->stop_node() != NULL) {

	1326 return CONTINUE;

	1327 }

	1328

	1329 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1330 if (trace->is_trivial()) {

	1331 if (label_.IsBound()) {

	1332 // We are being asked to generate a generic version, but that's already

	1333 // been done so just go to it.

	1334 macro_assembler->GoTo(&label_);

	1335 return DONE;

	1336 }

	1337 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {

	1338 // To avoid too deep recursion we push the node to the work queue and just

	1339 // generate a goto here.

	1340 compiler->AddWork(this);

	1341 macro_assembler->GoTo(&label_);

	1342 return DONE;

	1343 }

	1344 // Generate generic version of the node and bind the label for later use.

	1345 macro_assembler->BindBlock(&label_);

	1346 return CONTINUE;

	1347 }

	1348

	1349 // We are being asked to make a non-generic version. Keep track of how many

	1350 // non-generic versions we generate so as not to overdo it.

	1351 trace_count_++;

	1352 if (kRegexpOptimization &&

	1353 trace_count_ < kMaxCopiesCodeGenerated &&

	1354 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {

	1355 return CONTINUE;

	1356 }

	1357

	1358 // If we get here code has been generated for this node too many times or

	1359 // recursion is too deep. Time to switch to a generic version. The code for

	1360 // generic versions above can handle deep recursion properly.

	1361 trace->Flush(compiler, this);

	1362 return DONE;

	1363 }

	1364

	1365

	1366 // This generates the code to match a text node. A text node can contain

	1367 // straight character sequences (possibly to be matched in a case-independent

	1368 // way) and character classes. For efficiency we do not do this in a single

	1369 // pass from left to right. Instead we pass over the text node several times,

	1370 // emitting code for some character positions every time. See the comment on

	1371 // TextEmitPass for details.

	1372 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1373 LimitResult limit_result = LimitVersions(compiler, trace);

	1374 if (limit_result == DONE) return;

	1375 ASSERT(limit_result == CONTINUE);

	1376

	1377 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

	1378 compiler->SetRegExpTooBig();

	1379 return;

	1380 }

	1381

	1382 if (compiler->ascii()) {

	1383 intptr_t dummy = 0;

	1384 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);

	1385 }

	1386

	1387 bool first_elt_done = false;

	1388 intptr_t bound_checked_to = trace->cp_offset() - 1;

	1389 bound_checked_to += trace->bound_checked_up_to();

	1390

	1391 // If a character is preloaded into the current character register then

	1392 // check that now.

	1393 if (trace->characters_preloaded() == 1) {

	1394 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {

	1395 if (!SkipPass(pass, compiler->ignore_case())) {

	1396 TextEmitPass(compiler,

	1397 static_cast<TextEmitPassType>(pass),

	1398 true,

	1399 trace,

	1400 false,

	1401 &bound_checked_to);

	1402 }

	1403 }

	1404 first_elt_done = true;

	1405 }

	1406

	1407 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {

	1408 if (!SkipPass(pass, compiler->ignore_case())) {

	1409 TextEmitPass(compiler,

	1410 static_cast<TextEmitPassType>(pass),

	1411 false,

	1412 trace,

	1413 first_elt_done,

	1414 &bound_checked_to);

	1415 }

	1416 }

	1417

	1418 Trace successor_trace(*trace);

	1419 successor_trace.set_at_start(false);

	1420 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);

	1421 RecursionCheck rc(compiler);

	1422 on_success()->Emit(compiler, &successor_trace);

	1423 }

	1424

	1425

	1426 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1427 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1428 if (trace->stop_node() == this) {

	1429 intptr_t text_length =

	1430 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));

	1431 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);

	1432 // Update the counter-based backtracking info on the stack. This is an

	1433 // optimization for greedy loops (see below).

	1434 ASSERT(trace->cp_offset() == text_length);

	1435 macro_assembler->AdvanceCurrentPosition(text_length);

	1436 macro_assembler->GoTo(trace->loop_label());

	1437 return;

	1438 }

	1439 ASSERT(trace->stop_node() == NULL);

	1440 if (!trace->is_trivial()) {

	1441 trace->Flush(compiler, this);

	1442 return;

	1443 }

	1444 ChoiceNode::Emit(compiler, trace);

	1445 }

	1446

	1447

	1448 // This class is used when generating the alternatives in a choice node. It

	1449 // records the way the alternative is being code generated.

	1450 class AlternativeGeneration {

	1451 public:

	1452 AlternativeGeneration()

	1453 : possible_success(),

	1454 expects_preload(false),

	1455 after(),

	1456 quick_check_details() { }

	1457 BlockLabel possible_success;

	1458 bool expects_preload;

	1459 BlockLabel after;

	1460 QuickCheckDetails quick_check_details;

	1461 };

	1462

	1463

	1464 // Creates a list of AlternativeGenerations. If the list has a reasonable

	1465 // size then it is on the stack, otherwise the excess is on the heap.

	1466 class AlternativeGenerationList {

	1467 public:

	1468 explicit AlternativeGenerationList(intptr_t count)

	1469 : alt_gens_(count) {

	1470 for (intptr_t i = 0; i < count && i < kAFew; i++) {

	1471 alt_gens_.Add(a_few_alt_gens_ + i);

	1472 }

	1473 for (intptr_t i = kAFew; i < count; i++) {

	1474 alt_gens_.Add(new AlternativeGeneration());

	1475 }

	1476 }

	1477 ~AlternativeGenerationList() {

	1478 for (intptr_t i = kAFew; i < alt_gens_.length(); i++) {

	1479 delete alt_gens_[i];

	1480 alt_gens_[i] = NULL;

	1481 }

	1482 }

	1483

	1484 AlternativeGeneration* at(intptr_t i) {

	1485 return alt_gens_[i];

	1486 }

	1487

	1488 private:

	1489 static const intptr_t kAFew = 10;

	1490 GrowableArray<AlternativeGeneration*> alt_gens_;

	1491 AlternativeGeneration a_few_alt_gens_[kAFew];

	1492 };

	1493

	1494

	1495 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1496 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1497 intptr_t choice_count = alternatives_->length();

	1498

	1499 #ifdef DEBUG

	1500 for (intptr_t i = 0; i < choice_count - 1; i++) {

	1501 GuardedAlternative alternative = alternatives_->At(i);

	1502 ZoneGrowableArray<Guard> guards = alternative.guards();

	1503 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();

	1504 for (intptr_t j = 0; j < guard_count; j++) {

	1505 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));

	1506 }

	1507 }

	1508 #endif

	1509

	1510 LimitResult limit_result = LimitVersions(compiler, trace);

	1511 if (limit_result == DONE) return;

	1512 ASSERT(limit_result == CONTINUE);

	1513

	1514 intptr_t new_flush_budget = trace->flush_budget() / choice_count;

	1515 if (trace->flush_budget() == 0 && trace->actions() != NULL) {

	1516 trace->Flush(compiler, this);

	1517 return;

	1518 }

	1519

	1520 RecursionCheck rc(compiler);

	1521

	1522 Trace* current_trace = trace;

	1523

	1524 intptr_t text_length =

	1525 GreedyLoopTextLengthForAlternative(&((*alternatives_)[0]));

	1526 bool greedy_loop = false;

	1527 BlockLabel greedy_loop_label;

	1528 Trace counter_backtrack_trace;

	1529 counter_backtrack_trace.set_backtrack(&greedy_loop_label);

	1530 if (not_at_start()) counter_backtrack_trace.set_at_start(false);

	1531

	1532 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {

	1533 // Here we have special handling for greedy loops containing only text nodes

	1534 // and other simple nodes. These are handled by pushing the current

	1535 // position on the stack and then incrementing the current position each

	1536 // time around the switch. On backtrack we decrement the current position

	1537 // and check it against the pushed value. This avoids pushing backtrack

	1538 // information for each iteration of the loop, which could take up a lot of

	1539 // space.

	1540 greedy_loop = true;

	1541 ASSERT(trace->stop_node() == NULL);

	1542 macro_assembler->PushCurrentPosition();

	1543 current_trace = &counter_backtrack_trace;

	1544 BlockLabel greedy_match_failed;

	1545 Trace greedy_match_trace;

	1546 if (not_at_start()) greedy_match_trace.set_at_start(false);

	1547 greedy_match_trace.set_backtrack(&greedy_match_failed);

	1548 BlockLabel loop_label;

	1549 macro_assembler->BindBlock(&loop_label);

	1550 greedy_match_trace.set_stop_node(this);

	1551 greedy_match_trace.set_loop_label(&loop_label);

	1552 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);

	1553 macro_assembler->BindBlock(&greedy_match_failed);

	1554 }

	1555

	1556 BlockLabel second_choice; // For use in greedy matches.

	1557 macro_assembler->BindBlock(&second_choice);

	1558

	1559 intptr_t first_normal_choice = greedy_loop ? 1 : 0;

	1560

	1561 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;

	1562 const intptr_t kEatsAtLeastNotYetInitialized = -1;

	1563 intptr_t eats_at_least = kEatsAtLeastNotYetInitialized;

	1564

	1565 bool skip_was_emitted = false;

	1566

	1567 if (!greedy_loop && choice_count == 2) {

	1568 GuardedAlternative alt1 = (*alternatives_)[1];

	1569 if (alt1.guards() == NULL \|\| alt1.guards()->length() == 0) {

	1570 RegExpNode* eats_anything_node = alt1.node();

	1571 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==

	1572 this) {

	1573 // At this point we know that we are at a non-greedy loop that will eat

	1574 // any character one at a time. Any non-anchored regexp has such a

	1575 // loop prepended to it in order to find where it starts. We look for

	1576 // a pattern of the form ...abc... where we can look 6 characters ahead

	1577 // and step forwards 3 if the character is not one of abc. Abc need

	1578 // not be atoms, they can be any reasonably limited character class or

	1579 // small alternation.

	1580 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.

	1581 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

	1582 if (lookahead == NULL) {

	1583 eats_at_least = Utils::Minimum(kMaxLookaheadForBoyerMoore,

	1584 EatsAtLeast(kMaxLookaheadForBoyerMoore,

	1585 kRecursionBudget,

	1586 not_at_start));

	1587 if (eats_at_least >= 1) {

	1588 BoyerMooreLookahead* bm =

	1589 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);

	1590 GuardedAlternative alt0 = alternatives_->At(0);

	1591 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);

	1592 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);

	1593 }

	1594 } else {

	1595 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);

	1596 }

	1597 }

	1598 }

	1599 }

	1600

	1601 if (eats_at_least == kEatsAtLeastNotYetInitialized) {

	1602 // Save some time by looking at most one machine word ahead.

	1603 eats_at_least =

	1604 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);

	1605 }

	1606 intptr_t preload_characters =

	1607 CalculatePreloadCharacters(compiler, eats_at_least);

	1608

	1609 bool preload_is_current = !skip_was_emitted &&

	1610 (current_trace->characters_preloaded() == preload_characters);

	1611 bool preload_has_checked_bounds = preload_is_current;

	1612

	1613 AlternativeGenerationList alt_gens(choice_count);

	1614

	1615 // For now we just call all choices one after the other. The idea ultimately

	1616 // is to use the Dispatch table to try only the relevant ones.

	1617 for (intptr_t i = first_normal_choice; i < choice_count; i++) {

	1618 GuardedAlternative alternative = alternatives_->At(i);

	1619 AlternativeGeneration* alt_gen = alt_gens.at(i);

	1620 alt_gen->quick_check_details.set_characters(preload_characters);

	1621 ZoneGrowableArray<Guard> guards = alternative.guards();

	1622 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();

	1623 Trace new_trace(*current_trace);

	1624 new_trace.set_characters_preloaded(preload_is_current ?

	1625 preload_characters :

	1626 0);

	1627 if (preload_has_checked_bounds) {

	1628 new_trace.set_bound_checked_up_to(preload_characters);

	1629 }

	1630 new_trace.quick_check_performed()->Clear();

	1631 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);

	1632 alt_gen->expects_preload = preload_is_current;

	1633 bool generate_full_check_inline = false;

	1634 if (kRegexpOptimization &&

	1635 try_to_emit_quick_check_for_alternative(i) &&

	1636 alternative.node()->EmitQuickCheck(compiler,

	1637 &new_trace,

	1638 preload_has_checked_bounds,

	1639 &alt_gen->possible_success,

	1640 &alt_gen->quick_check_details,

	1641 i < choice_count - 1)) {

	1642 // Quick check was generated for this choice.

	1643 preload_is_current = true;

	1644 preload_has_checked_bounds = true;

	1645 // On the last choice in the ChoiceNode we generated the quick

	1646 // check to fall through on possible success. So now we need to

	1647 // generate the full check inline.

	1648 if (i == choice_count - 1) {

	1649 macro_assembler->BindBlock(&alt_gen->possible_success);

	1650 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);

	1651 new_trace.set_characters_preloaded(preload_characters);

	1652 new_trace.set_bound_checked_up_to(preload_characters);

	1653 generate_full_check_inline = true;

	1654 }

	1655 } else if (alt_gen->quick_check_details.cannot_match()) {

	1656 if (i == choice_count - 1 && !greedy_loop) {

	1657 macro_assembler->GoTo(trace->backtrack());

	1658 }

	1659 continue;

	1660 } else {

	1661 // No quick check was generated. Put the full code here.

	1662 // If this is not the first choice then there could be slow checks from

	1663 // previous cases that go here when they fail. There's no reason to

	1664 // insist that they preload characters since the slow check we are about

	1665 // to generate probably can't use it.

	1666 if (i != first_normal_choice) {

	1667 alt_gen->expects_preload = false;

	1668 new_trace.InvalidateCurrentCharacter();

	1669 }

	1670 if (i < choice_count - 1) {

	1671 new_trace.set_backtrack(&alt_gen->after);

	1672 }

	1673 generate_full_check_inline = true;

	1674 }

	1675 if (generate_full_check_inline) {

	1676 if (new_trace.actions() != NULL) {

	1677 new_trace.set_flush_budget(new_flush_budget);

	1678 }

	1679 for (intptr_t j = 0; j < guard_count; j++) {

	1680 GenerateGuard(macro_assembler, guards->At(j), &new_trace);

	1681 }

	1682 alternative.node()->Emit(compiler, &new_trace);

	1683 preload_is_current = false;

	1684 }

	1685 macro_assembler->BindBlock(&alt_gen->after);

	1686 }

	1687 if (greedy_loop) {

	1688 macro_assembler->BindBlock(&greedy_loop_label);

	1689 // If we have unwound to the bottom then backtrack.

	1690 macro_assembler->CheckGreedyLoop(trace->backtrack());

	1691 // Otherwise try the second priority at an earlier position.

	1692 macro_assembler->AdvanceCurrentPosition(-text_length);

	1693 macro_assembler->GoTo(&second_choice);

	1694 }

	1695

	1696 // At this point we need to generate slow checks for the alternatives where

	1697 // the quick check was inlined. We can recognize these because the associated

	1698 // label was bound.

	1699 for (intptr_t i = first_normal_choice; i < choice_count - 1; i++) {

	1700 AlternativeGeneration* alt_gen = alt_gens.at(i);

	1701 Trace new_trace(*current_trace);

	1702 // If there are actions to be flushed we have to limit how many times

	1703 // they are flushed. Take the budget of the parent trace and distribute

	1704 // it fairly amongst the children.

	1705 if (new_trace.actions() != NULL) {

	1706 new_trace.set_flush_budget(new_flush_budget);

	1707 }

	1708 EmitOutOfLineContinuation(compiler,

	1709 &new_trace,

	1710 alternatives_->At(i),

	1711 alt_gen,

	1712 preload_characters,

	1713 alt_gens.at(i + 1)->expects_preload);

	1714 }

	1715 }

	1716

	1717

	1718 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,

	1719 Trace* trace,

	1720 GuardedAlternative alternative,

	1721 AlternativeGeneration* alt_gen,

	1722 intptr_t preload_characters,

	1723 bool next_expects_preload) {

	1724 if (!alt_gen->possible_success.IsLinked()) return;

	1725

	1726 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1727 macro_assembler->BindBlock(&alt_gen->possible_success);

	1728 Trace out_of_line_trace(*trace);

	1729 out_of_line_trace.set_characters_preloaded(preload_characters);

	1730 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);

	1731 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);

	1732 ZoneGrowableArray<Guard> guards = alternative.guards();

	1733 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();

	1734 if (next_expects_preload) {

	1735 BlockLabel reload_current_char;

	1736 out_of_line_trace.set_backtrack(&reload_current_char);

	1737 for (intptr_t j = 0; j < guard_count; j++) {

	1738 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);

	1739 }

	1740 alternative.node()->Emit(compiler, &out_of_line_trace);

	1741 macro_assembler->BindBlock(&reload_current_char);

	1742 // Reload the current character, since the next quick check expects that.

	1743 // We don't need to check bounds here because we only get into this

	1744 // code through a quick check which already did the checked load.

	1745 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),

	1746 NULL,

	1747 false,

	1748 preload_characters);

	1749 macro_assembler->GoTo(&(alt_gen->after));

	1750 } else {

	1751 out_of_line_trace.set_backtrack(&(alt_gen->after));

	1752 for (intptr_t j = 0; j < guard_count; j++) {

	1753 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);

	1754 }

	1755 alternative.node()->Emit(compiler, &out_of_line_trace);

	1756 }

	1757 }

	1758

	1759

	1760 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1761 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1762 LimitResult limit_result = LimitVersions(compiler, trace);

	1763 if (limit_result == DONE) return;

	1764 ASSERT(limit_result == CONTINUE);

	1765

	1766 RecursionCheck rc(compiler);

	1767

	1768 switch (action_type_) {

	1769 case STORE_POSITION: {

	1770 Trace::DeferredCapture

	1771 new_capture(data_.u_position_register.reg,

	1772 data_.u_position_register.is_capture,

	1773 trace);

	1774 Trace new_trace = *trace;

	1775 new_trace.add_action(&new_capture);

	1776 on_success()->Emit(compiler, &new_trace);

	1777 break;

	1778 }

	1779 case INCREMENT_REGISTER: {

	1780 Trace::DeferredIncrementRegister

	1781 new_increment(data_.u_increment_register.reg);

	1782 Trace new_trace = *trace;

	1783 new_trace.add_action(&new_increment);

	1784 on_success()->Emit(compiler, &new_trace);

	1785 break;

	1786 }

	1787 case SET_REGISTER: {

	1788 Trace::DeferredSetRegister

	1789 new_set(data_.u_store_register.reg, data_.u_store_register.value);

	1790 Trace new_trace = *trace;

	1791 new_trace.add_action(&new_set);

	1792 on_success()->Emit(compiler, &new_trace);

	1793 break;

	1794 }

	1795 case CLEAR_CAPTURES: {

	1796 Trace::DeferredClearCaptures

	1797 new_capture(Interval(data_.u_clear_captures.range_from,

	1798 data_.u_clear_captures.range_to));

	1799 Trace new_trace = *trace;

	1800 new_trace.add_action(&new_capture);

	1801 on_success()->Emit(compiler, &new_trace);

	1802 break;

	1803 }

	1804 case BEGIN_SUBMATCH:

	1805 if (!trace->is_trivial()) {

	1806 trace->Flush(compiler, this);

	1807 } else {

	1808 assembler->WriteCurrentPositionToRegister(

	1809 data_.u_submatch.current_position_register, 0);

	1810 assembler->WriteStackPointerToRegister(

	1811 data_.u_submatch.stack_pointer_register);

	1812 on_success()->Emit(compiler, trace);

	1813 }

	1814 break;

	1815 case EMPTY_MATCH_CHECK: {

	1816 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;

	1817 intptr_t stored_pos = 0;

	1818 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;

	1819 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);

	1820 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);

	1821 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {

	1822 // If we know we haven't advanced and there is no minimum we

	1823 // can just backtrack immediately.

	1824 assembler->GoTo(trace->backtrack());

	1825 } else if (know_dist && stored_pos < trace->cp_offset()) {

	1826 // If we know we've advanced we can generate the continuation

	1827 // immediately.

	1828 on_success()->Emit(compiler, trace);

	1829 } else if (!trace->is_trivial()) {

	1830 trace->Flush(compiler, this);

	1831 } else {

	1832 BlockLabel skip_empty_check;

	1833 // If we have a minimum number of repetitions we check the current

	1834 // number first and skip the empty check if it's not enough.

	1835 if (has_minimum) {

	1836 intptr_t limit = data_.u_empty_match_check.repetition_limit;

	1837 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);

	1838 }

	1839 // If the match is empty we bail out, otherwise we fall through

	1840 // to the on-success continuation.

	1841 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,

	1842 trace->backtrack());

	1843 assembler->BindBlock(&skip_empty_check);

	1844 on_success()->Emit(compiler, trace);

	1845 }

	1846 break;

	1847 }

	1848 case POSITIVE_SUBMATCH_SUCCESS: {

	1849 if (!trace->is_trivial()) {

	1850 trace->Flush(compiler, this);

	1851 return;

	1852 }

	1853 assembler->ReadCurrentPositionFromRegister(

	1854 data_.u_submatch.current_position_register);

	1855 assembler->ReadStackPointerFromRegister(

	1856 data_.u_submatch.stack_pointer_register);

	1857 intptr_t clear_register_count = data_.u_submatch.clear_register_count;

	1858 if (clear_register_count == 0) {

	1859 on_success()->Emit(compiler, trace);

	1860 return;

	1861 }

	1862 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;

	1863 BlockLabel clear_registers_backtrack;

	1864 Trace new_trace = *trace;

	1865 new_trace.set_backtrack(&clear_registers_backtrack);

	1866 on_success()->Emit(compiler, &new_trace);

	1867

	1868 assembler->BindBlock(&clear_registers_backtrack);

	1869 intptr_t clear_registers_to =

	1870 clear_registers_from + clear_register_count - 1;

	1871 assembler->ClearRegisters(clear_registers_from, clear_registers_to);

	1872

	1873 ASSERT(trace->backtrack() == NULL);

	1874 assembler->Backtrack();

	1875 return;

	1876 }

	1877 default:

	1878 UNREACHABLE();

	1879 }

	1880 }

	1881

	1882

	1883 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

	1884 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	1885 if (!trace->is_trivial()) {

	1886 trace->Flush(compiler, this);

	1887 return;

	1888 }

	1889

	1890 LimitResult limit_result = LimitVersions(compiler, trace);

	1891 if (limit_result == DONE) return;

	1892 ASSERT(limit_result == CONTINUE);

	1893

	1894 RecursionCheck rc(compiler);

	1895

	1896 ASSERT(start_reg_ + 1 == end_reg_);

	1897 if (compiler->ignore_case()) {

	1898 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,

	1899 trace->backtrack());

	1900 } else {

	1901 assembler->CheckNotBackReference(start_reg_, trace->backtrack());

	1902 }

	1903 on_success()->Emit(compiler, trace);

	1904 }

	1905

	1906

	1907 void ActionNode::FillInBMInfo(intptr_t offset,

	1908 intptr_t budget,

	1909 BoyerMooreLookahead* bm,

	1910 bool not_at_start) {

	1911 if (action_type_ == BEGIN_SUBMATCH) {

	1912 bm->SetRest(offset);

	1913 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {

	1914 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);

	1915 }

	1916 SaveBMInfo(bm, not_at_start, offset);

	1917 }

	1918

	1919

	1920 void AssertionNode::FillInBMInfo(intptr_t offset,

	1921 intptr_t budget,

	1922 BoyerMooreLookahead* bm,

	1923 bool not_at_start) {

	1924 // Match the behaviour of EatsAtLeast on this node.

	1925 if (assertion_type() == AT_START && not_at_start) return;

	1926 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);

	1927 SaveBMInfo(bm, not_at_start, offset);

	1928 }

	1929

	1930

	1931 void BackReferenceNode::FillInBMInfo(intptr_t offset,

	1932 intptr_t budget,

	1933 BoyerMooreLookahead* bm,

	1934 bool not_at_start) {

	1935 // Working out the set of characters that a backreference can match is too

	1936 // hard, so we just say that any character can match.

	1937 bm->SetRest(offset);

	1938 SaveBMInfo(bm, not_at_start, offset);

	1939 }

	1940

	1941

	1942 // Returns the number of characters in the equivalence class, omitting those

	1943 // that cannot occur in the source string because it is ASCII.

	1944 static intptr_t GetCaseIndependentLetters(uint16_t character,

	1945 bool ascii_subject,

	1946 int32_t* letters) {

	1947 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;

	1948 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);

	1949 // Unibrow returns 0 or 1 for characters where case independence is

	1950 // trivial.

	1951 if (length == 0) {

	1952 letters[0] = character;

	1953 length = 1;

	1954 }

	1955 if (!ascii_subject \|\| character <= Symbols::kMaxOneCharCodeSymbol) {

	1956 return length;

	1957 }

	1958 // The standard requires that non-ASCII characters cannot have ASCII

	1959 // character codes in their equivalence class.

	1960 return 0;

	1961 }

	1962

	1963

	1964 void ChoiceNode::FillInBMInfo(intptr_t offset,

	1965 intptr_t budget,

	1966 BoyerMooreLookahead* bm,

	1967 bool not_at_start) {

	1968 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();

	1969 budget = (budget - 1) / alts->length();

	1970 for (intptr_t i = 0; i < alts->length(); i++) {

	1971 GuardedAlternative& alt = (*alts)[i];

	1972 if (alt.guards() != NULL && alt.guards()->length() != 0) {

	1973 bm->SetRest(offset); // Give up trying to fill in info.

	1974 SaveBMInfo(bm, not_at_start, offset);

	1975 return;

	1976 }

	1977 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);

	1978 }

	1979 SaveBMInfo(bm, not_at_start, offset);

	1980 }

	1981

	1982

	1983 void EndNode::FillInBMInfo(intptr_t offset,

	1984 intptr_t budget,

	1985 BoyerMooreLookahead* bm,

	1986 bool not_at_start) {

	1987 // Returning 0 from EatsAtLeast should ensure we never get here.

	1988 UNREACHABLE();

	1989 }

	1990

	1991

	1992 void LoopChoiceNode::FillInBMInfo(intptr_t offset,

	1993 intptr_t budget,

	1994 BoyerMooreLookahead* bm,

	1995 bool not_at_start) {

	1996 if (body_can_be_zero_length_ \|\| budget <= 0) {

	1997 bm->SetRest(offset);

	1998 SaveBMInfo(bm, not_at_start, offset);

	1999 return;

	2000 }

	2001 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);

	2002 SaveBMInfo(bm, not_at_start, offset);

	2003 }

	2004

	2005

	2006 void TextNode::FillInBMInfo(intptr_t initial_offset,

	2007 intptr_t budget,

	2008 BoyerMooreLookahead* bm,

	2009 bool not_at_start) {

	2010 if (initial_offset >= bm->length()) return;

	2011 intptr_t offset = initial_offset;

	2012 intptr_t max_char = bm->max_char();

	2013 for (intptr_t i = 0; i < elements()->length(); i++) {

	2014 if (offset >= bm->length()) {

	2015 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	2016 return;

	2017 }

	2018 TextElement text = elements()->At(i);

	2019 if (text.text_type() == TextElement::ATOM) {

	2020 RegExpAtom* atom = text.atom();

	2021 for (intptr_t j = 0; j < atom->length(); j++, offset++) {

	2022 if (offset >= bm->length()) {

	2023 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	2024 return;

	2025 }

	2026 uint16_t character = atom->data()->At(j);

	2027 if (bm->compiler()->ignore_case()) {

	2028 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	2029 intptr_t length = GetCaseIndependentLetters(

	2030 character,

	2031 bm->max_char() == Symbols::kMaxOneCharCodeSymbol,

	2032 chars);

	2033 for (intptr_t j = 0; j < length; j++) {

	2034 bm->Set(offset, chars[j]);

	2035 }

	2036 } else {

	2037 if (character <= max_char) bm->Set(offset, character);

	2038 }

	2039 }

	2040 } else {

	2041 ASSERT(text.text_type() == TextElement::CHAR_CLASS);

	2042 RegExpCharacterClass* char_class = text.char_class();

	2043 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();

	2044 if (char_class->is_negated()) {

	2045 bm->SetAll(offset);

	2046 } else {

	2047 for (intptr_t k = 0; k < ranges->length(); k++) {

	2048 CharacterRange& range = (*ranges)[k];

	2049 if (range.from() > max_char) continue;

	2050 intptr_t to = Utils::Minimum(max_char,

	2051 static_cast<intptr_t>(range.to()));

	2052 bm->SetInterval(offset, Interval(range.from(), to));

	2053 }

	2054 }

	2055 offset++;

	2056 }

	2057 }

	2058 if (offset >= bm->length()) {

	2059 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	2060 return;

	2061 }

	2062 on_success()->FillInBMInfo(offset,

	2063 budget - 1,

	2064 bm,

	2065 true); // Not at start after a text node.

	2066 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	2067 }

	2068

	2069

	2070 // Check for [0-9A-Z_a-z].

	2071 static void EmitWordCheck(RegExpMacroAssembler* assembler,

	2072 BlockLabel* word,

	2073 BlockLabel* non_word,

	2074 bool fall_through_on_word) {

	2075 if (assembler->CheckSpecialCharacterClass(

	2076 fall_through_on_word ? 'w' : 'W',

	2077 fall_through_on_word ? non_word : word)) {

	2078 // Optimized implementation available.

	2079 return;

	2080 }

	2081 assembler->CheckCharacterGT('z', non_word);

	2082 assembler->CheckCharacterLT('0', non_word);

	2083 assembler->CheckCharacterGT('a' - 1, word);

	2084 assembler->CheckCharacterLT('9' + 1, word);

	2085 assembler->CheckCharacterLT('A', non_word);

	2086 assembler->CheckCharacterLT('Z' + 1, word);

	2087 if (fall_through_on_word) {

	2088 assembler->CheckNotCharacter('_', non_word);

	2089 } else {

	2090 assembler->CheckCharacter('_', word);

	2091 }

	2092 }

737	2093

738	2094

739 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).	2095 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

740 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {	2096 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

741 UNIMPLEMENTED();	2097 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2098 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

	2099 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);

	2100 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

	2101 if (lookahead == NULL) {

	2102 intptr_t eats_at_least =

	2103 Utils::Minimum(kMaxLookaheadForBoyerMoore,

	2104 EatsAtLeast(kMaxLookaheadForBoyerMoore,

	2105 kRecursionBudget,

	2106 not_at_start));

	2107 if (eats_at_least >= 1) {

	2108 BoyerMooreLookahead* bm =

	2109 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);

	2110 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);

	2111 if (bm->at(0)->is_non_word())

	2112 next_is_word_character = Trace::FALSE_VALUE;

	2113 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;

	2114 }

	2115 } else {

	2116 if (lookahead->at(0)->is_non_word())

	2117 next_is_word_character = Trace::FALSE_VALUE;

	2118 if (lookahead->at(0)->is_word())

	2119 next_is_word_character = Trace::TRUE_VALUE;

	2120 }

	2121 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);

	2122 if (next_is_word_character == Trace::UNKNOWN) {

	2123 BlockLabel before_non_word;

	2124 BlockLabel before_word;

	2125 if (trace->characters_preloaded() != 1) {

	2126 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);

	2127 }

	2128 // Fall through on non-word.

	2129 EmitWordCheck(assembler, &before_word, &before_non_word, false);

	2130 // Next character is not a word character.

	2131 assembler->BindBlock(&before_non_word);

	2132 BlockLabel ok;

	2133 // Backtrack on \B (non-boundary check) if previous is a word,

	2134 // since we know next is not a word and this would be a boundary.

	2135 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

	2136

	2137 if (!assembler->IsClosed()) {

	2138 assembler->GoTo(&ok);

	2139 }

	2140

	2141 assembler->BindBlock(&before_word);

	2142 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

	2143 assembler->BindBlock(&ok);

	2144 } else if (next_is_word_character == Trace::TRUE_VALUE) {

	2145 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

	2146 } else {

	2147 ASSERT(next_is_word_character == Trace::FALSE_VALUE);

	2148 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

	2149 }

742 }	2150 }

743	2151

744	2152

745 void AssertionNode::BacktrackIfPrevious(	2153 void AssertionNode::BacktrackIfPrevious(

746 RegExpCompiler* compiler,	2154 RegExpCompiler* compiler,

747 Trace* trace,	2155 Trace* trace,

748 AssertionNode::IfPrevious backtrack_if_previous) {	2156 AssertionNode::IfPrevious backtrack_if_previous) {

749 UNIMPLEMENTED();	2157 RegExpMacroAssembler* assembler = compiler->macro_assembler();

750 }	2158 Trace new_trace(*trace);

751	2159 new_trace.InvalidateCurrentCharacter();

752	2160

	2161 BlockLabel fall_through, dummy;

	2162

	2163 BlockLabel* non_word = backtrack_if_previous == kIsNonWord ?

	2164 new_trace.backtrack() :

	2165 &fall_through;

	2166 BlockLabel* word = backtrack_if_previous == kIsNonWord ?

	2167 &fall_through :

	2168 new_trace.backtrack();

	2169

	2170 if (new_trace.cp_offset() == 0) {

	2171 // The start of input counts as a non-word character, so the question is

	2172 // decided if we are at the start.

	2173 assembler->CheckAtStart(non_word);

	2174 }

	2175 // We already checked that we are not at the start of input so it must be

	2176 // OK to load the previous character.

	2177 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);

	2178 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);

	2179

	2180 assembler->BindBlock(&fall_through);

	2181 on_success()->Emit(compiler, &new_trace);

	2182 }

	2183

	2184

	2185 static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {

	2186 if (quick_check == NULL) return false;

	2187 if (offset >= quick_check->characters()) return false;

	2188 return quick_check->positions(offset)->determines_perfectly;

	2189 }

	2190

	2191

	2192 static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {

	2193 if (index > *checked_up_to) {

	2194 *checked_up_to = index;

	2195 }

	2196 }

	2197

	2198

	2199 typedef bool EmitCharacterFunction(Isolate* isolate,

	2200 RegExpCompiler* compiler,

	2201 uint16_t c,

	2202 BlockLabel* on_failure,

	2203 intptr_t cp_offset,

	2204 bool check,

	2205 bool preloaded);

	2206

	2207

	2208 static inline bool EmitSimpleCharacter(Isolate* isolate,

	2209 RegExpCompiler* compiler,

	2210 uint16_t c,

	2211 BlockLabel* on_failure,

	2212 intptr_t cp_offset,

	2213 bool check,

	2214 bool preloaded) {

	2215 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2216 bool bound_checked = false;

	2217 if (!preloaded) {

	2218 assembler->LoadCurrentCharacter(

	2219 cp_offset,

	2220 on_failure,

	2221 check);

	2222 bound_checked = true;

	2223 }

	2224 assembler->CheckNotCharacter(c, on_failure);

	2225 return bound_checked;

	2226 }

	2227

	2228

	2229 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

	2230 bool ascii,

	2231 uint16_t c1,

	2232 uint16_t c2,

	2233 BlockLabel* on_failure) {

	2234 uint16_t char_mask;

	2235 if (ascii) {

	2236 char_mask = Symbols::kMaxOneCharCodeSymbol;

	2237 } else {

	2238 char_mask = Utf16::kMaxCodeUnit;

	2239 }

	2240 uint16_t exor = c1 ^ c2;

	2241 // Check whether exor has only one bit set.

	2242 if (((exor - 1) & exor) == 0) {

	2243 // If c1 and c2 differ only by one bit.

	2244 // Ecma262UnCanonicalize always gives the highest number last.

	2245 ASSERT(c2 > c1);

	2246 uint16_t mask = char_mask ^ exor;

	2247 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);

	2248 return true;

	2249 }

	2250 ASSERT(c2 > c1);

	2251 uint16_t diff = c2 - c1;

	2252 if (((diff - 1) & diff) == 0 && c1 >= diff) {

	2253 // If the characters differ by 2^n but don't differ by one bit then

	2254 // subtract the difference from the found character, then do the or

	2255 // trick. We avoid the theoretical case where negative numbers are

	2256 // involved in order to simplify code generation.

	2257 uint16_t mask = char_mask ^ diff;

	2258 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,

	2259 diff,

	2260 mask,

	2261 on_failure);

	2262 return true;

	2263 }

	2264 return false;

	2265 }

	2266

	2267 // Only emits letters (things that have case). Only used for case independent

	2268 // matches.

	2269 static inline bool EmitAtomLetter(Isolate* isolate,

	2270 RegExpCompiler* compiler,

	2271 uint16_t c,

	2272 BlockLabel* on_failure,

	2273 intptr_t cp_offset,

	2274 bool check,

	2275 bool preloaded) {

	2276 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	2277 bool ascii = compiler->ascii();

	2278 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	2279 intptr_t length = GetCaseIndependentLetters(c, ascii, chars);

	2280 if (length <= 1) return false;

	2281 // We may not need to check against the end of the input string

	2282 // if this character lies before a character that matched.

	2283 if (!preloaded) {

	2284 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

	2285 }

	2286 BlockLabel ok;

	2287 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

	2288 switch (length) {

	2289 case 2: {

	2290 if (ShortCutEmitCharacterPair(macro_assembler,

	2291 ascii,

	2292 chars[0],

	2293 chars[1],

	2294 on_failure)) {

	2295 } else {

	2296 macro_assembler->CheckCharacter(chars[0], &ok);

	2297 macro_assembler->CheckNotCharacter(chars[1], on_failure);

	2298 macro_assembler->BindBlock(&ok);

	2299 }

	2300 break;

	2301 }

	2302 case 4:

	2303 macro_assembler->CheckCharacter(chars[3], &ok);

	2304 // Fall through!

	2305 case 3:

	2306 macro_assembler->CheckCharacter(chars[0], &ok);

	2307 macro_assembler->CheckCharacter(chars[1], &ok);

	2308 macro_assembler->CheckNotCharacter(chars[2], on_failure);

	2309 macro_assembler->BindBlock(&ok);

	2310 break;

	2311 default:

	2312 UNREACHABLE();

	2313 break;

	2314 }

	2315 return true;

	2316 }

	2317

	2318

	2319 // Only emits non-letters (things that don't have case). Only used for case

	2320 // independent matches.

	2321 static inline bool EmitAtomNonLetter(Isolate* isolate,

	2322 RegExpCompiler* compiler,

	2323 uint16_t c,

	2324 BlockLabel* on_failure,

	2325 intptr_t cp_offset,

	2326 bool check,

	2327 bool preloaded) {

	2328 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	2329 bool ascii = compiler->ascii();

	2330 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	2331 intptr_t length = GetCaseIndependentLetters(c, ascii, chars);

	2332 if (length < 1) {

	2333 // This can't match. Must be an ASCII subject and a non-ASCII character.

	2334 // We do not need to do anything since the ASCII pass already handled this.

	2335 return false; // Bounds not checked.

	2336 }

	2337 bool checked = false;

	2338 // We handle the length > 1 case in a later pass.

	2339 if (length == 1) {

	2340 if (ascii && c > Symbols::kMaxOneCharCodeSymbol) {

	2341 // Can't match - see above.

	2342 return false; // Bounds not checked.

	2343 }

	2344 if (!preloaded) {

	2345 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

	2346 checked = check;

	2347 }

	2348 macro_assembler->CheckNotCharacter(c, on_failure);

	2349 }

	2350 return checked;

	2351 }

	2352

	2353

	2354 static void EmitBoundaryTest(RegExpMacroAssembler* masm,

	2355 intptr_t border,

	2356 BlockLabel* fall_through,

	2357 BlockLabel* above_or_equal,

	2358 BlockLabel* below) {

	2359 if (below != fall_through) {

	2360 masm->CheckCharacterLT(border, below);

	2361 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);

	2362 } else {

	2363 masm->CheckCharacterGT(border - 1, above_or_equal);

	2364 }

	2365 }

	2366

	2367

	2368 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,

	2369 intptr_t first,

	2370 intptr_t last,

	2371 BlockLabel* fall_through,

	2372 BlockLabel* in_range,

	2373 BlockLabel* out_of_range) {

	2374 if (in_range == fall_through) {

	2375 if (first == last) {

	2376 masm->CheckNotCharacter(first, out_of_range);

	2377 } else {

	2378 masm->CheckCharacterNotInRange(first, last, out_of_range);

	2379 }

	2380 } else {

	2381 if (first == last) {

	2382 masm->CheckCharacter(first, in_range);

	2383 } else {

	2384 masm->CheckCharacterInRange(first, last, in_range);

	2385 }

	2386 if (out_of_range != fall_through) masm->GoTo(out_of_range);

	2387 }

	2388 }

	2389

	2390

	2391 static void CutOutRange(RegExpMacroAssembler* masm,

	2392 ZoneGrowableArray<int>* ranges,

	2393 intptr_t start_index,

	2394 intptr_t end_index,

	2395 intptr_t cut_index,

	2396 BlockLabel* even_label,

	2397 BlockLabel* odd_label) {

	2398 bool odd = (((cut_index - start_index) & 1) == 1);

	2399 BlockLabel* in_range_label = odd ? odd_label : even_label;

	2400 BlockLabel dummy;

	2401 EmitDoubleBoundaryTest(masm,

	2402 ranges->At(cut_index),

	2403 ranges->At(cut_index + 1) - 1,

	2404 &dummy,

	2405 in_range_label,

	2406 &dummy);

	2407 ASSERT(!dummy.IsLinked());

	2408 // Cut out the single range by rewriting the array. This creates a new

	2409 // range that is a merger of the two ranges on either side of the one we

	2410 // are cutting out. The oddity of the labels is preserved.

	2411 for (intptr_t j = cut_index; j > start_index; j--) {

	2412 (*ranges)[j] = ranges->At(j - 1);

	2413 }

	2414 for (intptr_t j = cut_index + 1; j < end_index; j++) {

	2415 (*ranges)[j] = ranges->At(j + 1);

	2416 }

	2417 }

	2418

	2419

	2420 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.

	2421 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.

	2422 static void EmitUseLookupTable(

	2423 RegExpMacroAssembler* masm,

	2424 ZoneGrowableArray<int>* ranges,

	2425 intptr_t start_index,

	2426 intptr_t end_index,

	2427 intptr_t min_char,

	2428 BlockLabel* fall_through,

	2429 BlockLabel* even_label,

	2430 BlockLabel* odd_label) {

	2431 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	2432 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;

	2433

	2434 intptr_t base = (min_char & ~kMask);

	2435

	2436 // Assert that everything is on one kTableSize page.

	2437 for (intptr_t i = start_index; i <= end_index; i++) {

	2438 ASSERT((ranges->At(i) & ~kMask) == base);

	2439 }

	2440 ASSERT(start_index == 0 \|\| (ranges->At(start_index - 1) & ~kMask) <= base);

	2441

	2442 char templ[kSize];

	2443 BlockLabel* on_bit_set;

	2444 BlockLabel* on_bit_clear;

	2445 intptr_t bit;

	2446 if (even_label == fall_through) {

	2447 on_bit_set = odd_label;

	2448 on_bit_clear = even_label;

	2449 bit = 1;

	2450 } else {

	2451 on_bit_set = even_label;

	2452 on_bit_clear = odd_label;

	2453 bit = 0;

	2454 }

	2455 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;

	2456 i++) {

	2457 templ[i] = bit;

	2458 }

	2459 intptr_t j = 0;

	2460 bit ^= 1;

	2461 for (intptr_t i = start_index; i < end_index; i++) {

	2462 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {

	2463 templ[j] = bit;

	2464 }

	2465 bit ^= 1;

	2466 }

	2467 for (intptr_t i = j; i < kSize; i++) {

	2468 templ[i] = bit;

	2469 }

	2470 // TODO(erikcorry): Cache these.

	2471 TypedData& ba = TypedData::ZoneHandle(
	Florian Schneider 2014/10/01 17:04:14 const TypedData& const TypedData& jgruber1 2014/10/03 18:59:52 Done. Show quoted text On 2014/10/01 17:04:14, Florian Schneider wrote: > const TypedData& Done.
	2472 masm->isolate(),

	2473 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));

	2474 for (intptr_t i = 0; i < kSize; i++) {

	2475 ba.SetUint8(i, templ[i]);

	2476 }

	2477 masm->CheckBitInTable(ba, on_bit_set);

	2478 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);

	2479 }

	2480

	2481

	2482 // Unicode case. Split the search space into kSize spaces that are handled

	2483 // with recursion.

	2484 static void SplitSearchSpace(ZoneGrowableArray<int>* ranges,

	2485 intptr_t start_index,

	2486 intptr_t end_index,

	2487 intptr_t* new_start_index,

	2488 intptr_t* new_end_index,

	2489 intptr_t* border) {

	2490 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	2491 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;

	2492

	2493 intptr_t first = ranges->At(start_index);

	2494 intptr_t last = ranges->At(end_index) - 1;

	2495

	2496 *new_start_index = start_index;

	2497 *border = (ranges->At(start_index) & ~kMask) + kSize;

	2498 while (*new_start_index < end_index) {

	2499 if (ranges->At(new_start_index) > border) break;

	2500 (*new_start_index)++;

	2501 }

	2502 // new_start_index is the index of the first edge that is beyond the

	2503 // current kSize space.

	2504

	2505 // For very large search spaces we do a binary chop search of the non-ASCII

	2506 // space instead of just going to the end of the current kSize space. The

	2507 // heuristics are complicated a little by the fact that any 128-character

	2508 // encoding space can be quickly tested with a table lookup, so we don't

	2509 // wish to do binary chop search at a smaller granularity than that. A

	2510 // 128-character space can take up a lot of space in the ranges array if,

	2511 // for example, we only want to match every second character (eg. the lower

	2512 // case characters on some Unicode pages).

	2513 intptr_t binary_chop_index = (end_index + start_index) / 2;

	2514 // The first test ensures that we get to the code that handles the ASCII

	2515 // range with a single not-taken branch, speeding up this important

	2516 // character range (even non-ASCII charset-based text has spaces and

	2517 // punctuation).

	2518 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // ASCII case.

	2519 end_index - start_index > (new_start_index - start_index) 2 &&

	2520 last - first > kSize * 2 &&

	2521 binary_chop_index > *new_start_index &&

	2522 ranges->At(binary_chop_index) >= first + 2 * kSize) {

	2523 intptr_t scan_forward_for_section_border = binary_chop_index;;

	2524 intptr_t new_border = (ranges->At(binary_chop_index) \| kMask) + 1;

	2525

	2526 while (scan_forward_for_section_border < end_index) {

	2527 if (ranges->At(scan_forward_for_section_border) > new_border) {

	2528 *new_start_index = scan_forward_for_section_border;

	2529 *border = new_border;

	2530 break;

	2531 }

	2532 scan_forward_for_section_border++;

	2533 }

	2534 }

	2535

	2536 ASSERT(*new_start_index > start_index);

	2537 new_end_index = new_start_index - 1;

	2538 if (ranges->At(new_end_index) == border) {

	2539 (*new_end_index)--;

	2540 }

	2541 if (*border >= ranges->At(end_index)) {

	2542 *border = ranges->At(end_index);

	2543 *new_start_index = end_index; // Won't be used.

	2544 *new_end_index = end_index - 1;

	2545 }

	2546 }

	2547

	2548

	2549 // Gets a series of segment boundaries representing a character class. If the

	2550 // character is in the range between an even and an odd boundary (counting from

	2551 // start_index) then go to even_label, otherwise go to odd_label. We already

	2552 // know that the character is in the range of min_char to max_char inclusive.

	2553 // Either label can be NULL indicating backtracking. Either label can also be

	2554 // equal to the fall_through label.

	2555 static void GenerateBranches(RegExpMacroAssembler* masm,

	2556 ZoneGrowableArray<int>* ranges,

	2557 intptr_t start_index,

	2558 intptr_t end_index,

	2559 uint16_t min_char,

	2560 uint16_t max_char,

	2561 BlockLabel* fall_through,

	2562 BlockLabel* even_label,

	2563 BlockLabel* odd_label) {

	2564 intptr_t first = ranges->At(start_index);

	2565 intptr_t last = ranges->At(end_index) - 1;

	2566

	2567 ASSERT(min_char < first);

	2568

	2569 // Just need to test if the character is before or on-or-after

	2570 // a particular character.

	2571 if (start_index == end_index) {

	2572 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);

	2573 return;

	2574 }

	2575

	2576 // Another almost trivial case: There is one interval in the middle that is

	2577 // different from the end intervals.

	2578 if (start_index + 1 == end_index) {

	2579 EmitDoubleBoundaryTest(

	2580 masm, first, last, fall_through, even_label, odd_label);

	2581 return;

	2582 }

	2583

	2584 // It's not worth using table lookup if there are very few intervals in the

	2585 // character class.

	2586 if (end_index - start_index <= 6) {

	2587 // It is faster to test for individual characters, so we look for those

	2588 // first, then try arbitrary ranges in the second round.

	2589 static intptr_t kNoCutIndex = -1;

	2590 intptr_t cut = kNoCutIndex;

	2591 for (intptr_t i = start_index; i < end_index; i++) {

	2592 if (ranges->At(i) == ranges->At(i + 1) - 1) {

	2593 cut = i;

	2594 break;

	2595 }

	2596 }

	2597 if (cut == kNoCutIndex) cut = start_index;

	2598 CutOutRange(

	2599 masm, ranges, start_index, end_index, cut, even_label, odd_label);

	2600 ASSERT(end_index - start_index >= 2);

	2601 GenerateBranches(masm,

	2602 ranges,

	2603 start_index + 1,

	2604 end_index - 1,

	2605 min_char,

	2606 max_char,

	2607 fall_through,

	2608 even_label,

	2609 odd_label);

	2610 return;

	2611 }

	2612

	2613 // If there are a lot of intervals in the regexp, then we will use tables to

	2614 // determine whether the character is inside or outside the character class.

	2615 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;

	2616

	2617 if ((max_char >> kBits) == (min_char >> kBits)) {

	2618 EmitUseLookupTable(masm,

	2619 ranges,

	2620 start_index,

	2621 end_index,

	2622 min_char,

	2623 fall_through,

	2624 even_label,

	2625 odd_label);

	2626 return;

	2627 }

	2628

	2629 if ((min_char >> kBits) != (first >> kBits)) {

	2630 masm->CheckCharacterLT(first, odd_label);

	2631 GenerateBranches(masm,

	2632 ranges,

	2633 start_index + 1,

	2634 end_index,

	2635 first,

	2636 max_char,

	2637 fall_through,

	2638 odd_label,

	2639 even_label);

	2640 return;

	2641 }

	2642

	2643 intptr_t new_start_index = 0;

	2644 intptr_t new_end_index = 0;

	2645 intptr_t border = 0;

	2646

	2647 SplitSearchSpace(ranges,

	2648 start_index,

	2649 end_index,

	2650 &new_start_index,

	2651 &new_end_index,

	2652 &border);

	2653

	2654 BlockLabel handle_rest;

	2655 BlockLabel* above = &handle_rest;

	2656 if (border == last + 1) {

	2657 // We didn't find any section that started after the limit, so everything

	2658 // above the border is one of the terminal labels.

	2659 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;

	2660 ASSERT(new_end_index == end_index - 1);

	2661 }

	2662

	2663 ASSERT(start_index <= new_end_index);

	2664 ASSERT(new_start_index <= end_index);

	2665 ASSERT(start_index < new_start_index);

	2666 ASSERT(new_end_index < end_index);

	2667 ASSERT(new_end_index + 1 == new_start_index \|\|

	2668 (new_end_index + 2 == new_start_index &&

	2669 border == ranges->At(new_end_index + 1)));

	2670 ASSERT(min_char < border - 1);

	2671 ASSERT(border < max_char);

	2672 ASSERT(ranges->At(new_end_index) < border);

	2673 ASSERT(border < ranges->At(new_start_index) \|\|

	2674 (border == ranges->At(new_start_index) &&

	2675 new_start_index == end_index &&

	2676 new_end_index == end_index - 1 &&

	2677 border == last + 1));

	2678 ASSERT(new_start_index == 0 \|\| border >= ranges->At(new_start_index - 1));

	2679

	2680 masm->CheckCharacterGT(border - 1, above);

	2681 BlockLabel dummy;

	2682 GenerateBranches(masm,

	2683 ranges,

	2684 start_index,

	2685 new_end_index,

	2686 min_char,

	2687 border - 1,

	2688 &dummy,

	2689 even_label,

	2690 odd_label);

	2691

	2692 if (handle_rest.IsLinked()) {

	2693 masm->BindBlock(&handle_rest);

	2694 bool flip = (new_start_index & 1) != (start_index & 1);

	2695 GenerateBranches(masm,

	2696 ranges,

	2697 new_start_index,

	2698 end_index,

	2699 border,

	2700 max_char,

	2701 &dummy,

	2702 flip ? odd_label : even_label,

	2703 flip ? even_label : odd_label);

	2704 }

	2705 }

	2706

	2707

	2708 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

	2709 RegExpCharacterClass* cc,

	2710 bool ascii,

	2711 BlockLabel* on_failure,

	2712 intptr_t cp_offset,

	2713 bool check_offset,

	2714 bool preloaded,

	2715 Isolate* isolate) {

	2716 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();

	2717 if (!CharacterRange::IsCanonical(ranges)) {

	2718 CharacterRange::Canonicalize(ranges);

	2719 }

	2720

	2721 intptr_t max_char;

	2722 if (ascii) {

	2723 max_char = Symbols::kMaxOneCharCodeSymbol;

	2724 } else {

	2725 max_char = Utf16::kMaxCodeUnit;

	2726 }

	2727

	2728 intptr_t range_count = ranges->length();

	2729

	2730 intptr_t last_valid_range = range_count - 1;

	2731 while (last_valid_range >= 0) {

	2732 CharacterRange& range = (*ranges)[last_valid_range];

	2733 if (range.from() <= max_char) {

	2734 break;

	2735 }

	2736 last_valid_range--;

	2737 }

	2738

	2739 if (last_valid_range < 0) {

	2740 if (!cc->is_negated()) {

	2741 macro_assembler->GoTo(on_failure);

	2742 }

	2743 if (check_offset) {

	2744 macro_assembler->CheckPosition(cp_offset, on_failure);

	2745 }

	2746 return;

	2747 }

	2748

	2749 if (last_valid_range == 0 &&

	2750 ranges->At(0).IsEverything(max_char)) {

	2751 if (cc->is_negated()) {

	2752 macro_assembler->GoTo(on_failure);

	2753 } else {

	2754 // This is a common case hit by non-anchored expressions.

	2755 if (check_offset) {

	2756 macro_assembler->CheckPosition(cp_offset, on_failure);

	2757 }

	2758 }

	2759 return;

	2760 }

	2761 if (last_valid_range == 0 &&

	2762 !cc->is_negated() &&

	2763 ranges->At(0).IsEverything(max_char)) {

	2764 // This is a common case hit by non-anchored expressions.

	2765 if (check_offset) {

	2766 macro_assembler->CheckPosition(cp_offset, on_failure);

	2767 }

	2768 return;

	2769 }

	2770

	2771 if (!preloaded) {

	2772 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);

	2773 }

	2774

	2775 if (cc->is_standard() &&

	2776 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),

	2777 on_failure)) {

	2778 return;

	2779 }

	2780

	2781

	2782 // A new list with ascending entries. Each entry is a code unit

	2783 // where there is a boundary between code units that are part of

	2784 // the class and code units that are not. Normally we insert an

	2785 // entry at zero which goes to the failure label, but if there

	2786 // was already one there we fall through for success on that entry.

	2787 // Subsequent entries have alternating meaning (success/failure).

	2788 ZoneGrowableArray<int>* range_boundaries =

	2789 new(isolate) ZoneGrowableArray<int>(last_valid_range);

	2790

	2791 bool zeroth_entry_is_failure = !cc->is_negated();

	2792

	2793 for (intptr_t i = 0; i <= last_valid_range; i++) {

	2794 CharacterRange& range = (*ranges)[i];

	2795 if (range.from() == 0) {

	2796 ASSERT(i == 0);

	2797 zeroth_entry_is_failure = !zeroth_entry_is_failure;

	2798 } else {

	2799 range_boundaries->Add(range.from());

	2800 }

	2801 range_boundaries->Add(range.to() + 1);

	2802 }

	2803 intptr_t end_index = range_boundaries->length() - 1;

	2804 if (range_boundaries->At(end_index) > max_char) {

	2805 end_index--;

	2806 }

	2807

	2808 BlockLabel fall_through;

	2809 GenerateBranches(macro_assembler,

	2810 range_boundaries,

	2811 0, // start_index.

	2812 end_index,

	2813 0, // min_char.

	2814 max_char,

	2815 &fall_through,

	2816 zeroth_entry_is_failure ? &fall_through : on_failure,

	2817 zeroth_entry_is_failure ? on_failure : &fall_through);

	2818 macro_assembler->BindBlock(&fall_through);

	2819 }

	2820

	2821

753 // We call this repeatedly to generate code for each pass over the text node.	2822 // We call this repeatedly to generate code for each pass over the text node.

754 // The passes are in increasing order of difficulty because we hope one	2823 // The passes are in increasing order of difficulty because we hope one

755 // of the first passes will fail in which case we are saved the work of the	2824 // of the first passes will fail in which case we are saved the work of the

756 // later passes. for example for the case independent regexp /%[asdfghjkl]a/	2825 // later passes. for example for the case independent regexp /%[asdfghjkl]a/

757 // we will check the '%' in the first pass, the case independent 'a' in the	2826 // we will check the '%' in the first pass, the case independent 'a' in the

758 // second pass and the character class in the last pass.	2827 // second pass and the character class in the last pass.

759 //	2828 //

760 // The passes are done from right to left, so for example to test for /bar/	2829 // The passes are done from right to left, so for example to test for /bar/

761 // we will first test for an 'r' with offset 2, then an 'a' with offset 1	2830 // we will first test for an 'r' with offset 2, then an 'a' with offset 1

762 // and then a 'b' with offset 0. This means we can avoid the end-of-input	2831 // and then a 'b' with offset 0. This means we can avoid the end-of-input

(...skipping 15 matching lines...) Expand all Loading...
778 // loading characters, which means we do not need to recheck the bounds	2847 // loading characters, which means we do not need to recheck the bounds

779 // up to the limit the quick check already checked. In addition the quick	2848 // up to the limit the quick check already checked. In addition the quick

780 // check can have involved a mask and compare operation which may simplify	2849 // check can have involved a mask and compare operation which may simplify

781 // or obviate the need for further checks at some character positions.	2850 // or obviate the need for further checks at some character positions.

782 void TextNode::TextEmitPass(RegExpCompiler* compiler,	2851 void TextNode::TextEmitPass(RegExpCompiler* compiler,

783 TextEmitPassType pass,	2852 TextEmitPassType pass,

784 bool preloaded,	2853 bool preloaded,

785 Trace* trace,	2854 Trace* trace,

786 bool first_element_checked,	2855 bool first_element_checked,

787 intptr_t* checked_up_to) {	2856 intptr_t* checked_up_to) {

788 UNIMPLEMENTED();	2857 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	2858 bool ascii = compiler->ascii();

	2859 BlockLabel* backtrack = trace->backtrack();

	2860 QuickCheckDetails* quick_check = trace->quick_check_performed();

	2861 intptr_t element_count = elms_->length();

	2862 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

	2863 TextElement elm = elms_->At(i);

	2864 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset();

	2865 if (elm.text_type() == TextElement::ATOM) {

	2866 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();

	2867 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {

	2868 if (first_element_checked && i == 0 && j == 0) continue;

	2869 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;

	2870 EmitCharacterFunction* emit_function = NULL;

	2871 switch (pass) {

	2872 case NON_ASCII_MATCH:

	2873 ASSERT(ascii);

	2874 if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) {

	2875 assembler->GoTo(backtrack);

	2876 return;

	2877 }

	2878 break;

	2879 case NON_LETTER_CHARACTER_MATCH:

	2880 emit_function = &EmitAtomNonLetter;

	2881 break;

	2882 case SIMPLE_CHARACTER_MATCH:

	2883 emit_function = &EmitSimpleCharacter;

	2884 break;

	2885 case CASE_CHARACTER_MATCH:

	2886 emit_function = &EmitAtomLetter;

	2887 break;

	2888 default:

	2889 break;

	2890 }

	2891 if (emit_function != NULL) {

	2892 bool bound_checked = emit_function(I,

	2893 compiler,

	2894 quarks->At(j),

	2895 backtrack,

	2896 cp_offset + j,

	2897 *checked_up_to < cp_offset + j,

	2898 preloaded);

	2899 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

	2900 }

	2901 }

	2902 } else {

	2903 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);

	2904 if (pass == CHARACTER_CLASS_MATCH) {

	2905 if (first_element_checked && i == 0) continue;

	2906 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;

	2907 RegExpCharacterClass* cc = elm.char_class();

	2908 EmitCharClass(assembler,

	2909 cc,

	2910 ascii,

	2911 backtrack,

	2912 cp_offset,

	2913 *checked_up_to < cp_offset,

	2914 preloaded,

	2915 I);

	2916 UpdateBoundsCheck(cp_offset, checked_up_to);

	2917 }

	2918 }

	2919 }

789 }	2920 }

790	2921

791	2922

792 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,	2923 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,

793 intptr_t eats_at_least) {	2924 intptr_t eats_at_least) {

794 UNIMPLEMENTED();	2925 intptr_t preload_characters = Utils::Minimum(static_cast<intptr_t>(4),

795 return NULL;	2926 eats_at_least);

	2927 if (compiler->macro_assembler()->CanReadUnaligned()) {

	2928 bool ascii = compiler->ascii();

	2929 if (ascii) {

	2930 if (preload_characters > 4) preload_characters = 4;

	2931 // We can't preload 3 characters because there is no machine instruction

	2932 // to do that. We can't just load 4 because we could be reading

	2933 // beyond the end of the string, which could cause a memory fault.

	2934 if (preload_characters == 3) preload_characters = 2;

	2935 } else {

	2936 if (preload_characters > 2) preload_characters = 2;

	2937 }

	2938 } else {

	2939 if (preload_characters > 1) preload_characters = 1;

	2940 }

	2941 return preload_characters;

796 }	2942 }

797	2943

798	2944

799 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,

800 Trace* trace,

801 GuardedAlternative alternative,

802 AlternativeGeneration* alt_gen,

803 intptr_t preload_characters,

804 bool next_expects_preload) {

805 UNIMPLEMENTED();

806 }

807

808

809 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,	2945 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,

810 intptr_t budget,	2946 intptr_t budget,

811 bool not_at_start) {	2947 bool not_at_start) {

812 intptr_t answer = Length();	2948 intptr_t answer = Length();

813 if (answer >= still_to_find) return answer;	2949 if (answer >= still_to_find) return answer;

814 if (budget <= 0) return answer;	2950 if (budget <= 0) return answer;

815 // We are not at start after this node so we set the last argument to 'true'.	2951 // We are not at start after this node so we set the last argument to 'true'.

816 return answer + on_success()->EatsAtLeast(still_to_find - answer,	2952 return answer + on_success()->EatsAtLeast(still_to_find - answer,

817 budget - 1,	2953 budget - 1,

818 true);	2954 true);

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
890 intptr_t budget,	3026 intptr_t budget,

891 bool not_at_start) {	3027 bool not_at_start) {

892 if (budget <= 0) return 0;	3028 if (budget <= 0) return 0;

893 // Alternative 0 is the negative lookahead, alternative 1 is what comes	3029 // Alternative 0 is the negative lookahead, alternative 1 is what comes

894 // afterwards.	3030 // afterwards.

895 RegExpNode* node = (*alternatives_)[1].node();	3031 RegExpNode* node = (*alternatives_)[1].node();

896 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);	3032 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);

897 }	3033 }

898	3034

899	3035

	3036 // Takes the left-most 1-bit and smears it out, setting all bits to its right.

	3037 static inline uint32_t SmearBitsRight(uint32_t v) {

	3038 v \|= v >> 1;

	3039 v \|= v >> 2;

	3040 v \|= v >> 4;

	3041 v \|= v >> 8;

	3042 v \|= v >> 16;

	3043 return v;

	3044 }

	3045

	3046

900 // Here is the meat of GetQuickCheckDetails (see also the comment on the	3047 // Here is the meat of GetQuickCheckDetails (see also the comment on the

901 // super-class in the .h file).	3048 // super-class in the .h file).

902 //	3049 //

903 // We iterate along the text object, building up for each character a	3050 // We iterate along the text object, building up for each character a

904 // mask and value that can be used to test for a quick failure to match.	3051 // mask and value that can be used to test for a quick failure to match.

905 // The masks and values for the positions will be combined into a single	3052 // The masks and values for the positions will be combined into a single

906 // machine word for the current character width in order to be used in	3053 // machine word for the current character width in order to be used in

907 // generating a quick check.	3054 // generating a quick check.

908 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,	3055 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

909 RegExpCompiler* compiler,	3056 RegExpCompiler* compiler,

910 intptr_t characters_filled_in,	3057 intptr_t characters_filled_in,

911 bool not_at_start) {	3058 bool not_at_start) {

912 UNIMPLEMENTED();	3059 ASSERT(characters_filled_in < details->characters());

	3060 intptr_t characters = details->characters();

	3061 intptr_t char_mask;

	3062 if (compiler->ascii()) {

	3063 char_mask = Symbols::kMaxOneCharCodeSymbol;

	3064 } else {

	3065 char_mask = Utf16::kMaxCodeUnit;

	3066 }

	3067 for (intptr_t k = 0; k < elms_->length(); k++) {

	3068 TextElement elm = elms_->At(k);

	3069 if (elm.text_type() == TextElement::ATOM) {

	3070 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();

	3071 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {

	3072 QuickCheckDetails::Position* pos =

	3073 details->positions(characters_filled_in);

	3074 uint16_t c = quarks->At(i);

	3075 if (c > char_mask) {

	3076 // If we expect a non-ASCII character from an ASCII string,

	3077 // there is no way we can match. Not even case independent

	3078 // matching can turn an ASCII character into non-ASCII or

	3079 // vice versa.

	3080 details->set_cannot_match();

	3081 pos->determines_perfectly = false;

	3082 return;

	3083 }

	3084 if (compiler->ignore_case()) {

	3085 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	3086 intptr_t length =

	3087 GetCaseIndependentLetters(c, compiler->ascii(), chars);

	3088 ASSERT(length != 0); // Can only happen if c > char_mask (see above).

	3089 if (length == 1) {

	3090 // This letter has no case equivalents, so it's nice and simple

	3091 // and the mask-compare will determine definitely whether we have

	3092 // a match at this character position.

	3093 pos->mask = char_mask;

	3094 pos->value = c;

	3095 pos->determines_perfectly = true;

	3096 } else {

	3097 uint32_t common_bits = char_mask;

	3098 uint32_t bits = chars[0];

	3099 for (intptr_t j = 1; j < length; j++) {

	3100 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);

	3101 common_bits ^= differing_bits;

	3102 bits &= common_bits;

	3103 }

	3104 // If length is 2 and common bits has only one zero in it then

	3105 // our mask and compare instruction will determine definitely

	3106 // whether we have a match at this character position. Otherwise

	3107 // it can only be an approximate check.

	3108 uint32_t one_zero = (common_bits \| ~char_mask);

	3109 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {

	3110 pos->determines_perfectly = true;

	3111 }

	3112 pos->mask = common_bits;

	3113 pos->value = bits;

	3114 }

	3115 } else {

	3116 // Don't ignore case. Nice simple case where the mask-compare will

	3117 // determine definitely whether we have a match at this character

	3118 // position.

	3119 pos->mask = char_mask;

	3120 pos->value = c;

	3121 pos->determines_perfectly = true;

	3122 }

	3123 characters_filled_in++;

	3124 ASSERT(characters_filled_in <= details->characters());

	3125 if (characters_filled_in == details->characters()) {

	3126 return;

	3127 }

	3128 }

	3129 } else {

	3130 QuickCheckDetails::Position* pos =

	3131 details->positions(characters_filled_in);

	3132 RegExpCharacterClass* tree = elm.char_class();

	3133 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();

	3134 if (tree->is_negated()) {

	3135 // A quick check uses multi-character mask and compare. There is no

	3136 // useful way to incorporate a negative char class into this scheme

	3137 // so we just conservatively create a mask and value that will always

	3138 // succeed.

	3139 pos->mask = 0;

	3140 pos->value = 0;

	3141 } else {

	3142 intptr_t first_range = 0;

	3143 while (ranges->At(first_range).from() > char_mask) {

	3144 first_range++;

	3145 if (first_range == ranges->length()) {

	3146 details->set_cannot_match();

	3147 pos->determines_perfectly = false;

	3148 return;

	3149 }

	3150 }

	3151 CharacterRange range = ranges->At(first_range);

	3152 uint16_t from = range.from();

	3153 uint16_t to = range.to();

	3154 if (to > char_mask) {

	3155 to = char_mask;

	3156 }

	3157 uint32_t differing_bits = (from ^ to);

	3158 // A mask and compare is only perfect if the differing bits form a

	3159 // number like 00011111 with one single block of trailing 1s.

	3160 if ((differing_bits & (differing_bits + 1)) == 0 &&

	3161 from + differing_bits == to) {

	3162 pos->determines_perfectly = true;

	3163 }

	3164 uint32_t common_bits = ~SmearBitsRight(differing_bits);

	3165 uint32_t bits = (from & common_bits);

	3166 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {

	3167 CharacterRange range = ranges->At(i);

	3168 uint16_t from = range.from();

	3169 uint16_t to = range.to();

	3170 if (from > char_mask) continue;

	3171 if (to > char_mask) to = char_mask;

	3172 // Here we are combining more ranges into the mask and compare

	3173 // value. With each new range the mask becomes more sparse and

	3174 // so the chances of a false positive rise. A character class

	3175 // with multiple ranges is assumed never to be equivalent to a

	3176 // mask and compare operation.

	3177 pos->determines_perfectly = false;

	3178 uint32_t new_common_bits = (from ^ to);

	3179 new_common_bits = ~SmearBitsRight(new_common_bits);

	3180 common_bits &= new_common_bits;

	3181 bits &= new_common_bits;

	3182 uint32_t differing_bits = (from & common_bits) ^ bits;

	3183 common_bits ^= differing_bits;

	3184 bits &= common_bits;

	3185 }

	3186 pos->mask = common_bits;

	3187 pos->value = bits;

	3188 }

	3189 characters_filled_in++;

	3190 ASSERT(characters_filled_in <= details->characters());

	3191 if (characters_filled_in == details->characters()) {

	3192 return;

	3193 }

	3194 }

	3195 }

	3196 ASSERT(characters_filled_in != details->characters());

	3197 if (!details->cannot_match()) {

	3198 on_success()-> GetQuickCheckDetails(details,

	3199 compiler,

	3200 characters_filled_in,

	3201 true);

	3202 }

913 }	3203 }

914	3204

915	3205

916 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	3206 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

917 RegExpCompiler* compiler,	3207 RegExpCompiler* compiler,

918 intptr_t characters_filled_in,	3208 intptr_t characters_filled_in,

919 bool not_at_start) {	3209 bool not_at_start) {

920 UNIMPLEMENTED();	3210 if (body_can_be_zero_length_ \|\| info()->visited) return;

	3211 VisitMarker marker(info());

	3212 return ChoiceNode::GetQuickCheckDetails(details,

	3213 compiler,

	3214 characters_filled_in,

	3215 not_at_start);

921 }	3216 }

922	3217

923	3218

924 intptr_t TextNode::Length() {	3219 intptr_t TextNode::Length() {

925 TextElement elm = elms_->Last();	3220 TextElement elm = elms_->Last();

926 ASSERT(elm.cp_offset() >= 0);	3221 ASSERT(elm.cp_offset() >= 0);

927 return elm.cp_offset() + elm.length();	3222 return elm.cp_offset() + elm.length();

928 }	3223 }

929	3224

930	3225

(...skipping 518 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1449 Isolate* isolate) {	3744 Isolate* isolate) {

1450 uint16_t bottom = from();	3745 uint16_t bottom = from();

1451 uint16_t top = to();	3746 uint16_t top = to();

1452 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {	3747 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {

1453 if (bottom > Symbols::kMaxOneCharCodeSymbol) return;	3748 if (bottom > Symbols::kMaxOneCharCodeSymbol) return;

1454 if (top > Symbols::kMaxOneCharCodeSymbol) {	3749 if (top > Symbols::kMaxOneCharCodeSymbol) {

1455 top = Symbols::kMaxOneCharCodeSymbol;	3750 top = Symbols::kMaxOneCharCodeSymbol;

1456 }	3751 }

1457 }	3752 }

1458	3753

1459 UNIMPLEMENTED();	3754 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;

	3755 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;

	3756 int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	3757 if (top == bottom) {

	3758 // If this is a singleton we just expand the one character.

	3759 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLIN T

	3760 for (intptr_t i = 0; i < length; i++) {

	3761 uint32_t chr = chars[i];

	3762 if (chr != bottom) {

	3763 ranges->Add(CharacterRange::Singleton(chars[i]));

	3764 }

	3765 }

	3766 } else {

	3767 // If this is a range we expand the characters block by block,

	3768 // expanding contiguous subranges (blocks) one at a time.

	3769 // The approach is as follows. For a given start character we

	3770 // look up the remainder of the block that contains it (represented

	3771 // by the end point), for instance we find 'z' if the character

	3772 // is 'c'. A block is characterized by the property

	3773 // that all characters uncanonicalize in the same way, except that

	3774 // each entry in the result is incremented by the distance from the first

	3775 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and // NOLINT

	3776 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].

	3777 // Once we've found the end point we look up its uncanonicalization

	3778 // and produce a range for each element. For instance for [c-f]

	3779 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only

	3780 // add a range if it is not already contained in the input, so [c-f]

	3781 // will be skipped but [C-F] will be added. If this range is not

	3782 // completely contained in a block we do this for all the blocks

	3783 // covered by the range (handling characters that is not in a block

	3784 // as a "singleton block").

	3785 int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	3786 intptr_t pos = bottom;

	3787 while (pos <= top) {

	3788 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);

	3789 uint16_t block_end;

	3790 if (length == 0) {

	3791 block_end = pos;

	3792 } else {

	3793 ASSERT(length == 1);

	3794 block_end = range[0];

	3795 }

	3796 intptr_t end = (block_end > top) ? top : block_end;

	3797 length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT

	3798 for (intptr_t i = 0; i < length; i++) {

	3799 uint32_t c = range[i];

	3800 uint16_t range_from = c - (block_end - pos);

	3801 uint16_t range_to = c - (block_end - end);

	3802 if (!(bottom <= range_from && range_to <= top)) {

	3803 ranges->Add(CharacterRange(range_from, range_to));

	3804 }

	3805 }

	3806 pos = end + 1;

	3807 }

	3808 }

	3809 }

1460	3810

1461 // unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	3811

1462 // if (top == bottom) {	3812 // -------------------------------------------------------------------

1463 // // If this is a singleton we just expand the one character.	3813 // Splay tree

1464 // intptr_t length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', ch ars); // NOLINT	3814

1465 // for (intptr_t i = 0; i < length; i++) {	3815

1466 // uint32_t chr = chars[i];	3816 // Workaround for the fact that ZoneGrowableArray does not have contains().

1467 // if (chr != bottom) {	3817 static bool ArrayContains(ZoneGrowableArray<unsigned>* array,

1468 // ranges->Add(CharacterRange::Singleton(chars[i]), zone);	3818 unsigned value) {

1469 // }	3819 for (intptr_t i = 0; i < array->length(); i++) {

1470 // }	3820 if (array->At(i) == value) {

1471 // } else {	3821 return true;

1472 // // If this is a range we expand the characters block by block,	3822 }

1473 // // expanding contiguous subranges (blocks) one at a time.	3823 }

1474 // // The approach is as follows. For a given start character we	3824 return false;

1475 // // look up the remainder of the block that contains it (represented	3825 }

1476 // // by the end point), for instance we find 'z' if the character	3826

1477 // // is 'c'. A block is characterized by the property	3827

1478 // // that all characters uncanonicalize in the same way, except that	3828 void OutSet::Set(unsigned value, Isolate* isolate) {

1479 // // each entry in the result is incremented by the distance from the first	3829 if (value < kFirstLimit) {

1480 // // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] a nd // NOLINT	3830 first_ \|= (1 << value);

1481 // // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].	3831 } else {

1482 // // Once we've found the end point we look up its uncanonicalization	3832 if (remaining_ == NULL)

1483 // // and produce a range for each element. For instance for [c-f]	3833 remaining_ = new(isolate) ZoneGrowableArray<unsigned>(1);

1484 // // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only	3834

1485 // // add a range if it is not already contained in the input, so [c-f]	3835 bool remaining_contains_value = ArrayContains(remaining_, value);

1486 // // will be skipped but [C-F] will be added. If this range is not	3836 if (remaining_->is_empty() \|\| !remaining_contains_value) {

1487 // // completely contained in a block we do this for all the blocks	3837 remaining_->Add(value);

1488 // // covered by the range (handling characters that is not in a block	3838 }

1489 // // as a "singleton block").	3839 }

1490 // unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];	3840 }

1491 // intptr_t pos = bottom;	3841

1492 // while (pos <= top) {	3842

1493 // intptr_t length = isolate->jsregexp_canonrange()->get(pos, '\0', range);	3843 bool OutSet::Get(unsigned value) const {

1494 // uint16_t block_end;	3844 if (value < kFirstLimit) {

1495 // if (length == 0) {	3845 return (first_ & (1 << value)) != 0;

1496 // block_end = pos;	3846 } else if (remaining_ == NULL) {

1497 // } else {	3847 return false;

1498 // ASSERT(length == 1);	3848 } else {

1499 // block_end = range[0];	3849 return ArrayContains(remaining_, value);

1500 // }	3850 }

1501 // intptr_t end = (block_end > top) ? top : block_end;

1502 // length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range) ; // NOLINT

1503 // for (intptr_t i = 0; i < length; i++) {

1504 // uint32_t c = range[i];

1505 // uint16_t range_from = c - (block_end - pos);

1506 // uint16_t range_to = c - (block_end - end);

1507 // if (!(bottom <= range_from && range_to <= top)) {

1508 // ranges->Add(CharacterRange(range_from, range_to));

1509 // }

1510 // }

1511 // pos = end + 1;

1512 // }

1513 // }

1514 }	3851 }

1515	3852

1516 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,	3853 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,

1517 RegExpCompiler* compiler,	3854 RegExpCompiler* compiler,

1518 intptr_t filled_in,	3855 intptr_t filled_in,

1519 bool not_at_start) {	3856 bool not_at_start) {

1520 if (assertion_type_ == AT_START && not_at_start) {	3857 if (assertion_type_ == AT_START && not_at_start) {

1521 details->set_cannot_match();	3858 details->set_cannot_match();

1522 return;	3859 return;

1523 }	3860 }

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1557 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,	3894 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,

1558 intptr_t budget,	3895 intptr_t budget,

1559 bool not_at_start) {	3896 bool not_at_start) {

1560 return EatsAtLeastHelper(still_to_find,	3897 return EatsAtLeastHelper(still_to_find,

1561 budget - 1,	3898 budget - 1,

1562 loop_node_,	3899 loop_node_,

1563 not_at_start);	3900 not_at_start);

1564 }	3901 }

1565	3902

1566	3903

1567 class RecursionCheck {

1568 public:

1569 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {

1570 compiler->IncrementRecursionDepth();

1571 }

1572 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }

1573 private:

1574 RegExpCompiler* compiler_;

1575 };

1576

1577

1578 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {

1579 UNIMPLEMENTED();

1580 return NULL;

1581 }

1582

1583

1584 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	3904 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

1585 RegExpCompiler* compiler,	3905 RegExpCompiler* compiler,

1586 intptr_t characters_filled_in,	3906 intptr_t characters_filled_in,

1587 bool not_at_start) {	3907 bool not_at_start) {

1588 not_at_start = (not_at_start \|\| not_at_start_);	3908 not_at_start = (not_at_start \|\| not_at_start_);

1589 intptr_t choice_count = alternatives_->length();	3909 intptr_t choice_count = alternatives_->length();

1590 ASSERT(choice_count > 0);	3910 ASSERT(choice_count > 0);

1591 (*alternatives_)[0].node()->GetQuickCheckDetails(details,	3911 (*alternatives_)[0].node()->GetQuickCheckDetails(details,

1592 compiler,	3912 compiler,

1593 characters_filled_in,	3913 characters_filled_in,

(...skipping 102 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1696 RegExpCompiler* compiler,	4016 RegExpCompiler* compiler,

1697 intptr_t filled_in,	4017 intptr_t filled_in,

1698 bool not_at_start) {	4018 bool not_at_start) {

1699 // Alternative 0 is the negative lookahead, alternative 1 is what comes	4019 // Alternative 0 is the negative lookahead, alternative 1 is what comes

1700 // afterwards.	4020 // afterwards.

1701 RegExpNode* node = (*alternatives_)[1].node();	4021 RegExpNode* node = (*alternatives_)[1].node();

1702 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);	4022 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);

1703 }	4023 }

1704	4024

1705	4025

	4026 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {

	4027 return RegExpEngine::CompilationResult("RegExp too big");

	4028 }

	4029

	4030

1706 // Attempts to compile the regexp using an Irregexp code generator. Returns	4031 // Attempts to compile the regexp using an Irregexp code generator. Returns

1707 // a fixed array or a null handle depending on whether it succeeded.	4032 // a fixed array or a null handle depending on whether it succeeded.

1708 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case,	4033 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case,

1709 bool ascii)	4034 intptr_t specialization_cid)

1710 : next_register_(2 * (capture_count + 1)),	4035 : next_register_(2 * (capture_count + 1)),

	4036 work_list_(NULL),

1711 recursion_depth_(0),	4037 recursion_depth_(0),

1712 ignore_case_(ignore_case),	4038 ignore_case_(ignore_case),

1713 ascii_(ascii),	4039 specialization_cid_(specialization_cid),

1714 reg_exp_too_big_(false),	4040 reg_exp_too_big_(false),

1715 current_expansion_factor_(1),	4041 current_expansion_factor_(1),

1716 isolate_(Isolate::Current()) {	4042 isolate_(Isolate::Current()) {

1717 accept_ = new(I) EndNode(EndNode::ACCEPT, I);	4043 accept_ = new(I) EndNode(EndNode::ACCEPT, I);

1718 }	4044 }

1719	4045

1720	4046

1721 RegExpEngine::CompilationResult RegExpCompiler::Assemble(	4047 RegExpEngine::CompilationResult RegExpCompiler::Assemble(

1722 RegExpMacroAssembler* macro_assembler,	4048 IRRegExpMacroAssembler* macro_assembler,

1723 RegExpNode* start,	4049 RegExpNode* start,

1724 intptr_t capture_count,	4050 intptr_t capture_count,

1725 const String& pattern) {	4051 const String& pattern) {

1726 UNIMPLEMENTED();	4052 static const bool use_slow_safe_regexp_compiler = false;

1727 return RegExpEngine::CompilationResult("");	4053

	4054 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);

	4055 macro_assembler_ = macro_assembler;

	4056

	4057 ZoneGrowableArray<RegExpNode*> work_list(0);

	4058 work_list_ = &work_list;

	4059 BlockLabel fail;

	4060 macro_assembler_->PushBacktrack(&fail);

	4061 Trace new_trace;

	4062 start->Emit(this, &new_trace);

	4063 macro_assembler_->BindBlock(&fail);

	4064 macro_assembler_->Fail();

	4065 while (!work_list.is_empty()) {

	4066 work_list.RemoveLast()->Emit(this, &new_trace);

	4067 }

	4068 if (reg_exp_too_big_) return IrregexpRegExpTooBig();

	4069

	4070 macro_assembler->FinalizeIndirectGotos();

	4071

	4072 return RegExpEngine::CompilationResult(macro_assembler,

	4073 macro_assembler->graph_entry(),

	4074 macro_assembler->num_blocks(),

	4075 macro_assembler->num_stack_locals());

1728 }	4076 }

1729	4077

1730	4078

1731 RegExpEngine::CompilationResult RegExpEngine::Compile(	4079 RegExpEngine::CompilationResult RegExpEngine::Compile(

1732 RegExpCompileData* data,	4080 RegExpCompileData* data,

1733 bool ignore_case,	4081 const ParsedFunction* parsed_function,

1734 bool is_global,	4082 ZoneGrowableArray<const ICData> ic_data_array) {

1735 bool is_multiline,

1736 const String& pattern,

1737 const String& sample_subject,

1738 bool is_ascii) {

1739 Isolate* isolate = Isolate::Current();	4083 Isolate* isolate = Isolate::Current();

1740	4084

1741 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);	4085 const Function& function = parsed_function->function();

	4086 const intptr_t specialization_cid = function.regexp_specialization();

	4087 const bool is_ascii = (specialization_cid == kOneByteStringCid \|\|

	4088 specialization_cid == kExternalOneByteStringCid);

	4089 JSRegExp& regexp = JSRegExp::Handle(isolate, function.regexp());

	4090 const String& pattern = String::Handle(isolate, regexp.pattern());

	4091 const String& sample_subject =

	4092 String::Handle(isolate, regexp.sample_subject(specialization_cid));

1742	4093

1743 // TODO(jgruber): Character frequency sampling.	4094 ASSERT(!regexp.IsNull());

	4095 ASSERT(!pattern.IsNull());

	4096 ASSERT(!sample_subject.IsNull());

	4097

	4098 const bool ignore_case = regexp.is_ignore_case();

	4099 const bool is_global = regexp.is_global();

	4100

	4101 RegExpCompiler compiler(data->capture_count, ignore_case, specialization_cid);

	4102

	4103 // Sample some characters from the middle of the string.

	4104 static const intptr_t kSampleSize = 128;

	4105

	4106 intptr_t chars_sampled = 0;

	4107 intptr_t half_way = (sample_subject.Length() - kSampleSize) / 2;

	4108 for (intptr_t i = Utils::Maximum(static_cast<intptr_t>(0), half_way);

	4109 i < sample_subject.Length() && chars_sampled < kSampleSize;

	4110 i++, chars_sampled++) {

	4111 compiler.frequency_collator()->CountCharacter(sample_subject.CharAt(i));

	4112 }

1744	4113

1745 // Wrap the body of the regexp in capture #0.	4114 // Wrap the body of the regexp in capture #0.

1746 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,	4115 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,

1747 0,	4116 0,

1748 &compiler,	4117 &compiler,

1749 compiler.accept());	4118 compiler.accept());

1750	4119

1751 RegExpNode* node = captured_body;	4120 RegExpNode* node = captured_body;

1752 bool is_start_anchored = data->tree->IsAnchoredAtStart();	4121 bool is_start_anchored = data->tree->IsAnchoredAtStart();

	4122 bool is_end_anchored = data->tree->IsAnchoredAtEnd();

	4123 intptr_t max_length = data->tree->max_match();

1753 if (!is_start_anchored) {	4124 if (!is_start_anchored) {

1754 // Add a .*? at the beginning, outside the body capture, unless	4125 // Add a .*? at the beginning, outside the body capture, unless

1755 // this expression is anchored at the beginning.	4126 // this expression is anchored at the beginning.

1756 RegExpNode* loop_node =	4127 RegExpNode* loop_node =

1757 RegExpQuantifier::ToNode(0,	4128 RegExpQuantifier::ToNode(0,

1758 RegExpTree::kInfinity,	4129 RegExpTree::kInfinity,

1759 false,	4130 false,

1760 new(isolate) RegExpCharacterClass('*'),	4131 new(isolate) RegExpCharacterClass('*'),

1761 &compiler,	4132 &compiler,

1762 captured_body,	4133 captured_body,

(...skipping 23 matching lines...) Expand all Loading...
1786	4157

1787 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate);	4158 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate);

1788 data->node = node;	4159 data->node = node;

1789 Analysis analysis(ignore_case, is_ascii);	4160 Analysis analysis(ignore_case, is_ascii);

1790 analysis.EnsureAnalyzed(node);	4161 analysis.EnsureAnalyzed(node);

1791 if (analysis.has_failed()) {	4162 if (analysis.has_failed()) {

1792 const char* error_message = analysis.error_message();	4163 const char* error_message = analysis.error_message();

1793 return CompilationResult(error_message);	4164 return CompilationResult(error_message);

1794 }	4165 }

1795	4166

1796 // TODO(jgruber): Assemble native code.	4167 // Native regexp implementation.

1797	4168

1798 return RegExpEngine::CompilationResult("");	4169 IRRegExpMacroAssembler* macro_assembler =

1799 }	4170 new(isolate) IRRegExpMacroAssembler(specialization_cid,

1800	4171 data->capture_count,

1801	4172 parsed_function,

	4173 ic_data_array,

	4174 isolate);

	4175

	4176 // Inserted here, instead of in Assembler, because it depends on information

	4177 // in the AST that isn't replicated in the Node structure.

	4178 static const intptr_t kMaxBacksearchLimit = 1024;

	4179 if (is_end_anchored &&

	4180 !is_start_anchored &&

	4181 max_length < kMaxBacksearchLimit) {

	4182 macro_assembler->SetCurrentPositionFromEnd(max_length);

	4183 }

	4184

	4185 if (is_global) {

	4186 macro_assembler->set_global_mode(

	4187 (data->tree->min_match() > 0)

	4188 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK

	4189 : RegExpMacroAssembler::GLOBAL);

	4190 }

	4191

	4192 RegExpEngine::CompilationResult result =

	4193 compiler.Assemble(macro_assembler,

	4194 node,

	4195 data->capture_count,

	4196 pattern);

	4197

	4198 if (FLAG_trace_irregexp) {

	4199 macro_assembler->PrintBlocks();

	4200 }

	4201

	4202 return result;

	4203 }

	4204

	4205

	4206 static void CreateSpecializedFunction(Isolate* isolate,

	4207 JSRegExp* regexp,

	4208 intptr_t specialization_cid,

	4209 const Object& owner) {

	4210 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;

	4211

	4212 Function& fn = Function::Handle(isolate,

	4213 Function::New(String::Handle(isolate, Symbols::New("RegExp")),

	4214 RawFunction::kIrregexpFunction,

	4215 true, // Static.

	4216 false, // Not const.

	4217 false, // Not abstract.

	4218 false, // Not external.

	4219 false, // Not native.

	4220 owner,

	4221 0)); // Requires a non-negative token position.

	4222

	4223 fn.set_num_fixed_parameters(kParamCount);

	4224 fn.set_parameter_types(

	4225 Array::Handle(isolate, Array::New(kParamCount, Heap::kOld)));

	4226 fn.set_parameter_names(

	4227 Array::Handle(isolate, Array::New(kParamCount, Heap::kOld)));

	4228 fn.SetParameterTypeAt(0, Type::Handle(isolate, Type::DynamicType()));

	4229 fn.SetParameterNameAt(0, Symbols::string_param_());

	4230 fn.SetParameterTypeAt(1, Type::Handle(isolate, Type::DynamicType()));

	4231 fn.SetParameterNameAt(1, Symbols::start_index_param_());

	4232 fn.set_result_type(Type::Handle(isolate, Type::ArrayType()));

	4233

	4234 // Cache the result.

	4235 regexp->set_function(specialization_cid, fn);

	4236

	4237 fn.set_regexp(*regexp);

	4238 fn.set_regexp_specialization(specialization_cid);

	4239

	4240 // The function is compiled lazily during the first call.

	4241 }

	4242

	4243

	4244 RawJSRegExp* RegExpEngine::New(Isolate* isolate,

	4245 const String& pattern,

	4246 bool multi_line,

	4247 bool ignore_case) {

	4248 JSRegExp& regexp = JSRegExp::Handle(JSRegExp::New(0));

	4249

	4250 regexp.set_pattern(pattern);

	4251

	4252 if (multi_line) regexp.set_is_multi_line();

	4253 if (ignore_case) regexp.set_is_ignore_case();

	4254

	4255 // TODO(jgruber): We might want to use normal string searching algorithms

	4256 // for simple patterns.

	4257 regexp.set_is_complex();

	4258 regexp.set_is_global(); // All dart regexps are global.

	4259

	4260 const Library& lib = Library::Handle(isolate, Library::CoreLibrary());

	4261 const Class& owner = Class::Handle(isolate,

	4262 lib.LookupClass(String::Handle(isolate, Symbols::New("RegExp"))));

	4263

	4264 CreateSpecializedFunction(isolate, &regexp, kOneByteStringCid, owner);

	4265 CreateSpecializedFunction(isolate, &regexp, kTwoByteStringCid, owner);

	4266 CreateSpecializedFunction(isolate, &regexp, kExternalOneByteStringCid, owner);

	4267 CreateSpecializedFunction(isolate, &regexp, kExternalTwoByteStringCid, owner);

	4268

	4269 return regexp.raw();

	4270 }

	4271

	4272

	4273 void BoyerMoorePositionInfo::Set(intptr_t character) {

	4274 SetInterval(Interval(character, character));

	4275 }

	4276

	4277

	4278 ContainedInLattice AddRange(ContainedInLattice containment,

	4279 const intptr_t* ranges,

	4280 intptr_t ranges_length,

	4281 Interval new_range) {

	4282 ASSERT((ranges_length & 1) == 1);

	4283 ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1);

	4284 if (containment == kLatticeUnknown) return containment;

	4285 bool inside = false;

	4286 intptr_t last = 0;

	4287 for (intptr_t i = 0; i < ranges_length;

	4288 inside = !inside, last = ranges[i], i++) {

	4289 // Consider the range from last to ranges[i].

	4290 // We haven't got to the new range yet.

	4291 if (ranges[i] <= new_range.from()) continue;

	4292 // New range is wholly inside last-ranges[i]. Note that new_range.to() is

	4293 // inclusive, but the values in ranges are not.

	4294 if (last <= new_range.from() && new_range.to() < ranges[i]) {

	4295 return Combine(containment, inside ? kLatticeIn : kLatticeOut);

	4296 }

	4297 return kLatticeUnknown;

	4298 }

	4299 return containment;

	4300 }

	4301

	4302

	4303 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {

	4304 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);

	4305 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);

	4306 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);

	4307 surrogate_ =

	4308 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);

	4309 if (interval.to() - interval.from() >= kMapSize - 1) {

	4310 if (map_count_ != kMapSize) {

	4311 map_count_ = kMapSize;

	4312 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;

	4313 }

	4314 return;

	4315 }

	4316 for (intptr_t i = interval.from(); i <= interval.to(); i++) {

	4317 intptr_t mod_character = (i & kMask);

	4318 if (!map_->At(mod_character)) {

	4319 map_count_++;

	4320 (*map_)[mod_character] = true;

	4321 }

	4322 if (map_count_ == kMapSize) return;

	4323 }

	4324 }

	4325

	4326

	4327 void BoyerMoorePositionInfo::SetAll() {

	4328 s_ = w_ = d_ = kLatticeUnknown;

	4329 if (map_count_ != kMapSize) {

	4330 map_count_ = kMapSize;

	4331 for (intptr_t i = 0; i < kMapSize; i++) (*map_)[i] = true;

	4332 }

	4333 }

	4334

	4335

	4336 BoyerMooreLookahead::BoyerMooreLookahead(

	4337 intptr_t length, RegExpCompiler* compiler, Isolate* isolate)

	4338 : length_(length),

	4339 compiler_(compiler) {

	4340 if (compiler->ascii()) {

	4341 max_char_ = Symbols::kMaxOneCharCodeSymbol;

	4342 } else {

	4343 max_char_ = Utf16::kMaxCodeUnit;

	4344 }

	4345 bitmaps_ = new(isolate) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);

	4346 for (intptr_t i = 0; i < length; i++) {

	4347 bitmaps_->Add(new(isolate) BoyerMoorePositionInfo(isolate));

	4348 }

	4349 }

	4350

	4351

	4352 // Take all the characters that will not prevent a successful match if they

	4353 // occur in the subject string in the range between min_lookahead and

	4354 // max_lookahead (inclusive) measured from the current position. If the

	4355 // character at max_lookahead offset is not one of these characters, then we

	4356 // can safely skip forwards by the number of characters in the range.

	4357 intptr_t BoyerMooreLookahead::GetSkipTable(intptr_t min_lookahead,

	4358 intptr_t max_lookahead,

	4359 TypedData* boolean_skip_table) {
	Florian Schneider 2014/10/01 17:04:14 const TypedData& since this will never be NULL. const TypedData& since this will never be NULL. jgruber1 2014/10/03 18:59:52 Done. Show quoted text On 2014/10/01 17:04:14, Florian Schneider wrote: > const TypedData& since this will never be NULL. Done.
	4360 const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	4361

	4362 const intptr_t kSkipArrayEntry = 0;

	4363 const intptr_t kDontSkipArrayEntry = 1;

	4364

	4365 for (intptr_t i = 0; i < kSize; i++) {

	4366 boolean_skip_table->SetUint8(i, kSkipArrayEntry);

	4367 }

	4368 intptr_t skip = max_lookahead + 1 - min_lookahead;

	4369

	4370 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {

	4371 BoyerMoorePositionInfo* map = bitmaps_->At(i);

	4372 for (intptr_t j = 0; j < kSize; j++) {

	4373 if (map->at(j)) {

	4374 boolean_skip_table->SetUint8(j, kDontSkipArrayEntry);

	4375 }

	4376 }

	4377 }

	4378

	4379 return skip;

	4380 }

	4381

	4382

	4383 // Find the longest range of lookahead that has the fewest number of different

	4384 // characters that can occur at a given position. Since we are optimizing two

	4385 // different parameters at once this is a tradeoff.

	4386 bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {

	4387 intptr_t biggest_points = 0;

	4388 // If more than 32 characters out of 128 can occur it is unlikely that we can

	4389 // be lucky enough to step forwards much of the time.

	4390 const intptr_t kMaxMax = 32;

	4391 for (intptr_t max_number_of_chars = 4;

	4392 max_number_of_chars < kMaxMax;

	4393 max_number_of_chars *= 2) {

	4394 biggest_points =

	4395 FindBestInterval(max_number_of_chars, biggest_points, from, to);

	4396 }

	4397 if (biggest_points == 0) return false;

	4398 return true;

	4399 }

	4400

	4401

	4402 // Find the highest-points range between 0 and length_ where the character

	4403 // information is not too vague. 'Too vague' means that there are more than

	4404 // max_number_of_chars that can occur at this position. Calculates the number

	4405 // of points as the product of width-of-the-range and

	4406 // probability-of-finding-one-of-the-characters, where the probability is

	4407 // calculated using the frequency distribution of the sample subject string.

	4408 intptr_t BoyerMooreLookahead::FindBestInterval(

	4409 intptr_t max_number_of_chars,

	4410 intptr_t old_biggest_points,

	4411 intptr_t* from,

	4412 intptr_t* to) {

	4413 intptr_t biggest_points = old_biggest_points;

	4414 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	4415 for (intptr_t i = 0; i < length_; ) {

	4416 while (i < length_ && Count(i) > max_number_of_chars) i++;

	4417 if (i == length_) break;

	4418 intptr_t remembered_from = i;

	4419 bool union_map[kSize];

	4420 for (intptr_t j = 0; j < kSize; j++) union_map[j] = false;

	4421 while (i < length_ && Count(i) <= max_number_of_chars) {

	4422 BoyerMoorePositionInfo* map = bitmaps_->At(i);

	4423 for (intptr_t j = 0; j < kSize; j++) union_map[j] \|= map->at(j);

	4424 i++;

	4425 }

	4426 intptr_t frequency = 0;

	4427 for (intptr_t j = 0; j < kSize; j++) {

	4428 if (union_map[j]) {

	4429 // Add 1 to the frequency to give a small per-character boost for

	4430 // the cases where our sampling is not good enough and many

	4431 // characters have a frequency of zero. This means the frequency

	4432 // can theoretically be up to 2*kSize though we treat it mostly as

	4433 // a fraction of kSize.

	4434 frequency += compiler_->frequency_collator()->Frequency(j) + 1;

	4435 }

	4436 }

	4437 // We use the probability of skipping times the distance we are skipping to

	4438 // judge the effectiveness of this. Actually we have a cut-off: By

	4439 // dividing by 2 we switch off the skipping if the probability of skipping

	4440 // is less than 50%. This is because the multibyte mask-and-compare

	4441 // skipping in quickcheck is more likely to do well on this case.

	4442 bool in_quickcheck_range = ((i - remembered_from < 4) \|\|

	4443 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));

	4444 // Called 'probability' but it is only a rough estimate and can actually

	4445 // be outside the 0-kSize range.

	4446 intptr_t probability =

	4447 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;

	4448 intptr_t points = (i - remembered_from) * probability;

	4449 if (points > biggest_points) {

	4450 *from = remembered_from;

	4451 *to = i - 1;

	4452 biggest_points = points;

	4453 }

	4454 }

	4455 return biggest_points;

	4456 }

	4457

	4458

	4459 // See comment above on the implementation of GetSkipTable.

	4460 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {

	4461 const intptr_t kSize = RegExpMacroAssembler::kTableSize;

	4462

	4463 intptr_t min_lookahead = 0;

	4464 intptr_t max_lookahead = 0;

	4465

	4466 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;

	4467

	4468 bool found_single_character = false;

	4469 intptr_t single_character = 0;

	4470 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {

	4471 BoyerMoorePositionInfo* map = bitmaps_->At(i);

	4472 if (map->map_count() > 1 \|\|

	4473 (found_single_character && map->map_count() != 0)) {

	4474 found_single_character = false;

	4475 break;

	4476 }

	4477 for (intptr_t j = 0; j < kSize; j++) {

	4478 if (map->at(j)) {

	4479 found_single_character = true;

	4480 single_character = j;

	4481 break;

	4482 }

	4483 }

	4484 }

	4485

	4486 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;

	4487

	4488 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {

	4489 // The mask-compare can probably handle this better.

	4490 return false;

	4491 }

	4492

	4493 if (found_single_character) {

	4494 BlockLabel cont, again;

	4495 masm->BindBlock(&again);

	4496 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

	4497 if (max_char_ > kSize) {

	4498 masm->CheckCharacterAfterAnd(single_character,

	4499 RegExpMacroAssembler::kTableMask,

	4500 &cont);

	4501 } else {

	4502 masm->CheckCharacter(single_character, &cont);

	4503 }

	4504 masm->AdvanceCurrentPosition(lookahead_width);

	4505 masm->GoTo(&again);

	4506 masm->BindBlock(&cont);

	4507 return true;

	4508 }

	4509

	4510 TypedData& boolean_skip_table = TypedData::ZoneHandle(
	Florian Schneider 2014/10/01 17:04:14 const TypedData& const TypedData& jgruber1 2014/10/03 18:59:52 Done. Show quoted text On 2014/10/01 17:04:14, Florian Schneider wrote: > const TypedData& Done.
	4511 compiler_->isolate(),

	4512 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));

	4513 intptr_t skip_distance = GetSkipTable(

	4514 min_lookahead, max_lookahead, &boolean_skip_table);

	4515 ASSERT(skip_distance != 0);

	4516

	4517 BlockLabel cont, again;

	4518

	4519 masm->BindBlock(&again);

	4520 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

	4521 masm->CheckBitInTable(boolean_skip_table, &cont);

	4522 masm->AdvanceCurrentPosition(skip_distance);

	4523 masm->GoTo(&again);

	4524 masm->BindBlock(&cont);

	4525

	4526 return true;

	4527 }

	4528

	4529

1802 // -------------------------------------------------------------------	4530 // -------------------------------------------------------------------

1803 // Analysis	4531 // Analysis

1804	4532

1805	4533

1806 void Analysis::EnsureAnalyzed(RegExpNode* that) {	4534 void Analysis::EnsureAnalyzed(RegExpNode* that) {

1807 if (that->info()->been_analyzed \|\| that->info()->being_analyzed)	4535 if (that->info()->been_analyzed \|\| that->info()->being_analyzed)

1808 return;	4536 return;

1809 that->info()->being_analyzed = true;	4537 that->info()->being_analyzed = true;

1810 that->Accept(this);	4538 that->Accept(this);

1811 that->info()->being_analyzed = false;	4539 that->info()->being_analyzed = false;

(...skipping 168 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1980	4708

1981	4709

1982 void DotPrinter::PrintAttributes(RegExpNode* that) {	4710 void DotPrinter::PrintAttributes(RegExpNode* that) {

1983 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "	4711 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "

1984 "margin=0.1, fontsize=10, label=\"{", that);	4712 "margin=0.1, fontsize=10, label=\"{", that);

1985 AttributePrinter printer;	4713 AttributePrinter printer;

1986 NodeInfo* info = that->info();	4714 NodeInfo* info = that->info();

1987 printer.PrintBit("NI", info->follows_newline_interest);	4715 printer.PrintBit("NI", info->follows_newline_interest);

1988 printer.PrintBit("WI", info->follows_word_interest);	4716 printer.PrintBit("WI", info->follows_word_interest);

1989 printer.PrintBit("SI", info->follows_start_interest);	4717 printer.PrintBit("SI", info->follows_start_interest);

1990 Label* label = that->label();	4718 BlockLabel* label = that->label();

1991 if (label->IsBound())	4719 if (label->IsBound())

1992 printer.PrintPositive("@", label->Position());	4720 printer.PrintPositive("@", label->Position());

1993 OS::Print("}\"];\n"	4721 OS::Print("}\"];\n"

1994 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",	4722 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",

1995 that, that);	4723 that, that);

1996 }	4724 }

1997	4725

1998	4726

1999 static const bool kPrintDispatchTable = false;	4727 static const bool kPrintDispatchTable = false;

2000 void DotPrinter::VisitChoice(ChoiceNode* that) {	4728 void DotPrinter::VisitChoice(ChoiceNode* that) {

(...skipping 141 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2142 RegExpNode* node,	4870 RegExpNode* node,

2143 bool ignore_case) {	4871 bool ignore_case) {

2144 DotPrinter printer(ignore_case);	4872 DotPrinter printer(ignore_case);

2145 printer.PrintNode(label, node);	4873 printer.PrintNode(label, node);

2146 }	4874 }

2147	4875

2148	4876

2149 #endif // DEBUG	4877 #endif // DEBUG

2150	4878

2151 } // namespace dart	4879 } // namespace dart

OLD	NEW

« runtime/vm/regexp.h ('K') | « runtime/vm/regexp.h ('k') | runtime/vm/regexp_assembler.h » ('j') | runtime/vm/regexp_assembler.h » ('J')