Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(19)

Side by Side Diff: runtime/vm/regexp.cc

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/regexp.h" 5 #include "vm/regexp.h"
6 6
7 #include "vm/dart_entry.h"
8 #include "vm/regexp_assembler.h"
7 #include "vm/regexp_ast.h" 9 #include "vm/regexp_ast.h"
10 #include "vm/unibrow-inl.h"
8 #include "vm/unicode.h" 11 #include "vm/unicode.h"
9 #include "vm/symbols.h" 12 #include "vm/symbols.h"
10 13
11 #define I isolate() 14 #define I isolate()
12 #define CI compiler->isolate() 15 #define CI compiler->isolate()
13 16
14 namespace dart { 17 namespace dart {
15 18
19 DECLARE_FLAG(bool, trace_irregexp);
16 20
17 #define DEFINE_ACCEPT(Type) \ 21 #define DEFINE_ACCEPT(Type) \
18 void Type##Node::Accept(NodeVisitor* visitor) { \ 22 void Type##Node::Accept(NodeVisitor* visitor) { \
19 visitor->Visit##Type(this); \ 23 visitor->Visit##Type(this); \
20 } 24 }
21 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 25 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
22 #undef DEFINE_ACCEPT 26 #undef DEFINE_ACCEPT
23 27
24 28
25 // Default to generating optimized regexp code. 29 // Default to generating optimized regexp code.
26 static const bool kRegexpOptimization = true; 30 static const bool kRegexpOptimization = true;
27 31
28
29 // More makes code generation slower, less makes V8 benchmark score lower. 32 // More makes code generation slower, less makes V8 benchmark score lower.
30 static const intptr_t kMaxLookaheadForBoyerMoore = 8; 33 static const intptr_t kMaxLookaheadForBoyerMoore = 8;
34
31 // In a 3-character pattern you can maximally step forwards 3 characters 35 // In a 3-character pattern you can maximally step forwards 3 characters
32 // at a time, which is not always enough to pay for the extra logic. 36 // at a time, which is not always enough to pay for the extra logic.
33 static const intptr_t kPatternTooShortForBoyerMoore = 2; 37 static const intptr_t kPatternTooShortForBoyerMoore = 2;
34 38
35
36 // The '2' variant has inclusive from and exclusive to. 39 // The '2' variant has inclusive from and exclusive to.
37 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 40 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
38 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 41 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
39 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 42 static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
40 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 43 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
41 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 44 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
42 0xFEFF, 0xFF00, 0x10000 }; 45 0xFEFF, 0xFF00, 0x10000 };
43 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); 46 static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
44 static const intptr_t kWordRanges[] = { 47 static const intptr_t kWordRanges[] = {
45 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 48 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
46 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges); 49 static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
47 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 50 static const intptr_t kDigitRanges[] = { '0', '9' + 1, 0x10000 };
48 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges); 51 static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
52 static const intptr_t kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
53 static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
49 static const intptr_t kLineTerminatorRanges[] = { 54 static const intptr_t kLineTerminatorRanges[] = {
50 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 }; 55 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 };
51 static const intptr_t kLineTerminatorRangeCount = 56 static const intptr_t kLineTerminatorRangeCount =
52 ARRAY_SIZE(kLineTerminatorRanges); 57 ARRAY_SIZE(kLineTerminatorRanges);
53 58
54 59
55 // TODO(jgruber): Remove me.
56 class RegExpMacroAssembler {
57 public:
58 // The implementation must be able to handle at least:
59 static const intptr_t kMaxRegister = (1 << 16) - 1;
60 };
61
62
63 // TODO(jgruber): Copied from regexp_ast.cc
64 static inline void PrintUtf16(uint16_t c) { 60 static inline void PrintUtf16(uint16_t c) {
65 const char* format = (0x20 <= c && c <= 0x7F) 61 const char* format = (0x20 <= c && c <= 0x7F) ?
66 ? "%c" 62 "%c" : (c <= 0xff) ? "\\x%02x" : "\\u%04x";
67 : (c <= 0xff) ? "\\x%02x" : "\\u%04x";
68 OS::Print(format, c); 63 OS::Print(format, c);
69 } 64 }
70 65
71 66
72 // We need to check for the following characters: 0x39c 0x3bc 0x178. 67 // We need to check for the following characters: 0x39c 0x3bc 0x178.
73 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 68 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
74 // TODO(dcarney): this could be a lot more efficient. 69 // TODO(dcarney): this could be a lot more efficient.
75 return range.Contains(0x39c) || 70 return range.Contains(0x39c) ||
76 range.Contains(0x3bc) || range.Contains(0x178); 71 range.Contains(0x3bc) || range.Contains(0x178);
77 } 72 }
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
126 return next_register_; 121 return next_register_;
127 } 122 }
128 return next_register_++; 123 return next_register_++;
129 } 124 }
130 125
131 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 126 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
132 RegExpNode* start, 127 RegExpNode* start,
133 intptr_t capture_count, 128 intptr_t capture_count,
134 const String& pattern); 129 const String& pattern);
135 130
131 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
132
136 static const intptr_t kImplementationOffset = 0; 133 static const intptr_t kImplementationOffset = 0;
137 static const intptr_t kNumberOfRegistersOffset = 0; 134 static const intptr_t kNumberOfRegistersOffset = 0;
138 static const intptr_t kCodeOffset = 1; 135 static const intptr_t kCodeOffset = 1;
139 136
140 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 137 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
141 EndNode* accept() { return accept_; } 138 EndNode* accept() { return accept_; }
142 139
143 static const intptr_t kMaxRecursion = 100; 140 static const intptr_t kMaxRecursion = 100;
144 inline intptr_t recursion_depth() { return recursion_depth_; } 141 inline intptr_t recursion_depth() { return recursion_depth_; }
145 inline void IncrementRecursionDepth() { recursion_depth_++; } 142 inline void IncrementRecursionDepth() { recursion_depth_++; }
146 inline void DecrementRecursionDepth() { recursion_depth_--; } 143 inline void DecrementRecursionDepth() { recursion_depth_--; }
147 144
148 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 145 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
149 146
150 inline bool ignore_case() { return ignore_case_; } 147 inline bool ignore_case() { return ignore_case_; }
151 inline bool ascii() { return ascii_; } 148 inline bool ascii() { return ascii_; }
152 149
153 intptr_t current_expansion_factor() { return current_expansion_factor_; } 150 intptr_t current_expansion_factor() { return current_expansion_factor_; }
154 void set_current_expansion_factor(intptr_t value) { 151 void set_current_expansion_factor(intptr_t value) {
155 current_expansion_factor_ = value; 152 current_expansion_factor_ = value;
156 } 153 }
157 154
158 Isolate* isolate() const { return isolate_; } 155 Isolate* isolate() const { return isolate_; }
159 156
160 static const intptr_t kNoRegister = -1; 157 static const intptr_t kNoRegister = -1;
161 158
162 private: 159 private:
163 EndNode* accept_; 160 EndNode* accept_;
164 intptr_t next_register_; 161 intptr_t next_register_;
162 ZoneGrowableArray<RegExpNode*>* work_list_;
165 intptr_t recursion_depth_; 163 intptr_t recursion_depth_;
166 RegExpMacroAssembler* macro_assembler_; 164 RegExpMacroAssembler* macro_assembler_;
167 bool ignore_case_; 165 bool ignore_case_;
168 bool ascii_; 166 bool ascii_;
169 bool reg_exp_too_big_; 167 bool reg_exp_too_big_;
170 intptr_t current_expansion_factor_; 168 intptr_t current_expansion_factor_;
171 Isolate* isolate_; 169 Isolate* isolate_;
172 }; 170 };
173 171
174 172
173 class RecursionCheck {
174 public:
175 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
176 compiler->IncrementRecursionDepth();
177 }
178 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
179 private:
180 RegExpCompiler* compiler_;
181 };
182
183
175 // Scoped object to keep track of how much we unroll quantifier loops in the 184 // Scoped object to keep track of how much we unroll quantifier loops in the
176 // regexp graph generator. 185 // regexp graph generator.
177 class RegExpExpansionLimiter { 186 class RegExpExpansionLimiter {
178 public: 187 public:
179 static const intptr_t kMaxExpansionFactor = 6; 188 static const intptr_t kMaxExpansionFactor = 6;
180 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor) 189 RegExpExpansionLimiter(RegExpCompiler* compiler, intptr_t factor)
181 : compiler_(compiler), 190 : compiler_(compiler),
182 saved_expansion_factor_(compiler->current_expansion_factor()), 191 saved_expansion_factor_(compiler->current_expansion_factor()),
183 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 192 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
184 ASSERT(factor > 0); 193 ASSERT(factor > 0);
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
224 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 233 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
225 RegExpNode* on_success) { 234 RegExpNode* on_success) {
226 ZoneGrowableArray<TextElement>* elms = 235 ZoneGrowableArray<TextElement>* elms =
227 new(CI) ZoneGrowableArray<TextElement>(1); 236 new(CI) ZoneGrowableArray<TextElement>(1);
228 for (intptr_t i = 0; i < elements()->length(); i++) { 237 for (intptr_t i = 0; i < elements()->length(); i++) {
229 elms->Add(elements()->At(i)); 238 elms->Add(elements()->At(i));
230 } 239 }
231 return new(CI) TextNode(elms, on_success); 240 return new(CI) TextNode(elms, on_success);
232 } 241 }
233 242
243
234 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 244 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
235 RegExpNode* on_success) { 245 RegExpNode* on_success) {
236 return new(CI) TextNode(this, on_success); 246 return new(CI) TextNode(this, on_success);
237 } 247 }
238 248
239 249
240 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 250 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
241 RegExpNode* on_success) { 251 RegExpNode* on_success) {
242 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives(); 252 ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
243 intptr_t length = alternatives->length(); 253 intptr_t length = alternatives->length();
(...skipping 474 matching lines...) Expand 10 before | Expand all | Expand 10 after
718 // we don't need to check it. 728 // we don't need to check it.
719 if (neg_replacement == NULL) return set_replacement(replacement); 729 if (neg_replacement == NULL) return set_replacement(replacement);
720 alternatives_->At(0).set_node(neg_replacement); 730 alternatives_->At(0).set_node(neg_replacement);
721 return set_replacement(this); 731 return set_replacement(this);
722 } 732 }
723 733
724 734
725 // Code emission --------------------------------------------------------------- 735 // Code emission ---------------------------------------------------------------
726 736
727 737
728 #define DEFINE_EMIT(Type) \ 738 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
729 void Type##Node::Emit(RegExpCompiler* compiler, Trace* trace) { \ 739 Guard* guard,
730 UNIMPLEMENTED(); \ 740 Trace* trace) {
731 } 741 switch (guard->op()) {
732 FOR_EACH_NODE_TYPE(DEFINE_EMIT) 742 case Guard::LT:
733 DEFINE_EMIT(LoopChoice) 743 ASSERT(!trace->mentions_reg(guard->reg()));
734 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, 744 macro_assembler->IfRegisterGE(guard->reg(),
735 Trace* trace) { 745 guard->value(),
746 trace->backtrack());
747 break;
748 case Guard::GEQ:
749 ASSERT(!trace->mentions_reg(guard->reg()));
750 macro_assembler->IfRegisterLT(guard->reg(),
751 guard->value(),
752 trace->backtrack());
753 break;
754 }
755 }
756
757
758 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
759 RegExpMacroAssembler* assembler = compiler->macro_assembler();
760
761 // Omit flushing the trace. We discard the entire stack frame anyway.
762
763 if (!label()->IsBound()) {
764 // We are completely independent of the trace, since we ignore it,
765 // so this code can be used as the generic version.
766 assembler->BindBlock(label());
767 }
768
769 // Throw away everything on the backtrack stack since the start
770 // of the negative submatch and restore the character position.
771 assembler->ReadCurrentPositionFromRegister(current_position_register_);
772 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
773 if (clear_capture_count_ > 0) {
774 // Clear any captures that might have been performed during the success
775 // of the body of the negative look-ahead.
776 intptr_t clear_capture_end =
777 clear_capture_start_ + clear_capture_count_ - 1;
778 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
779 }
780 // Now that we have unwound the stack we find at the top of the stack the
781 // backtrack that the BeginSubmatch node got.
782 assembler->Backtrack();
783 }
784
785
786 bool Trace::GetStoredPosition(intptr_t reg, int* cp_offset) {
787 ASSERT(*cp_offset == 0);
788 for (DeferredAction* action = actions_;
789 action != NULL;
790 action = action->next()) {
791 if (action->Mentions(reg)) {
792 if (action->action_type() == ActionNode::STORE_POSITION) {
793 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
794 return true;
795 } else {
796 return false;
797 }
798 }
799 }
800 return false;
801 }
802
803
804 // This is called as we come into a loop choice node and some other tricky
805 // nodes. It normalizes the state of the code generator to ensure we can
806 // generate generic code.
807 intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers,
808 Isolate* isolate) {
809 intptr_t max_register = RegExpCompiler::kNoRegister;
810 for (DeferredAction* action = actions_;
811 action != NULL;
812 action = action->next()) {
813 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
814 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
815 for (intptr_t i = range.from(); i <= range.to(); i++)
816 affected_registers->Set(i, isolate);
817 if (range.to() > max_register) max_register = range.to();
818 } else {
819 affected_registers->Set(action->reg(), isolate);
820 if (action->reg() > max_register) max_register = action->reg();
821 }
822 }
823 return max_register;
824 }
825
826
827 bool Trace::DeferredAction::Mentions(intptr_t that) {
828 if (action_type() == ActionNode::CLEAR_CAPTURES) {
829 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
830 return range.Contains(that);
831 } else {
832 return reg() == that;
833 }
834 }
835
836
837 bool Trace::mentions_reg(intptr_t reg) {
838 for (DeferredAction* action = actions_;
839 action != NULL;
840 action = action->next()) {
841 if (action->Mentions(reg))
842 return true;
843 }
844 return false;
845 }
846
847
848 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
849 intptr_t max_register,
850 const OutSet& registers_to_pop,
851 const OutSet& registers_to_clear) {
852 for (intptr_t reg = max_register; reg >= 0; reg--) {
853 if (registers_to_pop.Get(reg)) {
854 assembler->PopRegister(reg);
855 } else if (registers_to_clear.Get(reg)) {
856 intptr_t clear_to = reg;
857 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
858 reg--;
859 }
860 assembler->ClearRegisters(reg, clear_to);
861 }
862 }
863 }
864
865
866 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
867 intptr_t max_register,
868 const OutSet& affected_registers,
869 OutSet* registers_to_pop,
870 OutSet* registers_to_clear,
871 Isolate* isolate) {
872 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
873 const intptr_t push_limit = (assembler->stack_limit_slack() + 1) / 2;
874
875 // Count pushes performed to force a stack limit check occasionally.
876 intptr_t pushes = 0;
877
878 for (intptr_t reg = 0; reg <= max_register; reg++) {
879 if (!affected_registers.Get(reg)) {
880 continue;
881 }
882
883 // The chronologically first deferred action in the trace
884 // is used to infer the action needed to restore a register
885 // to its previous state (or not, if it's safe to ignore it).
886 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
887 DeferredActionUndoType undo_action = IGNORE;
888
889 intptr_t value = 0;
890 bool absolute = false;
891 bool clear = false;
892 intptr_t store_position = -1;
893 // This is a little tricky because we are scanning the actions in reverse
894 // historical order (newest first).
895 for (DeferredAction* action = actions_;
896 action != NULL;
897 action = action->next()) {
898 if (action->Mentions(reg)) {
899 switch (action->action_type()) {
900 case ActionNode::SET_REGISTER: {
901 Trace::DeferredSetRegister* psr =
902 static_cast<Trace::DeferredSetRegister*>(action);
903 if (!absolute) {
904 value += psr->value();
905 absolute = true;
906 }
907 // SET_REGISTER is currently only used for newly introduced loop
908 // counters. They can have a significant previous value if they
909 // occour in a loop. TODO(lrn): Propagate this information, so
910 // we can set undo_action to IGNORE if we know there is no value to
911 // restore.
912 undo_action = RESTORE;
913 ASSERT(store_position == -1);
914 ASSERT(!clear);
915 break;
916 }
917 case ActionNode::INCREMENT_REGISTER:
918 if (!absolute) {
919 value++;
920 }
921 ASSERT(store_position == -1);
922 ASSERT(!clear);
923 undo_action = RESTORE;
924 break;
925 case ActionNode::STORE_POSITION: {
926 Trace::DeferredCapture* pc =
927 static_cast<Trace::DeferredCapture*>(action);
928 if (!clear && store_position == -1) {
929 store_position = pc->cp_offset();
930 }
931
932 // For captures we know that stores and clears alternate.
933 // Other register, are never cleared, and if the occur
934 // inside a loop, they might be assigned more than once.
935 if (reg <= 1) {
936 // Registers zero and one, aka "capture zero", is
937 // always set correctly if we succeed. There is no
938 // need to undo a setting on backtrack, because we
939 // will set it again or fail.
940 undo_action = IGNORE;
941 } else {
942 undo_action = pc->is_capture() ? CLEAR : RESTORE;
943 }
944 ASSERT(!absolute);
945 ASSERT(value == 0);
946 break;
947 }
948 case ActionNode::CLEAR_CAPTURES: {
949 // Since we're scanning in reverse order, if we've already
950 // set the position we have to ignore historically earlier
951 // clearing operations.
952 if (store_position == -1) {
953 clear = true;
954 }
955 undo_action = RESTORE;
956 ASSERT(!absolute);
957 ASSERT(value == 0);
958 break;
959 }
960 default:
961 UNREACHABLE();
962 break;
963 }
964 }
965 }
966 // Prepare for the undo-action (e.g., push if it's going to be popped).
967 if (undo_action == RESTORE) {
968 pushes++;
969 RegExpMacroAssembler::StackCheckFlag stack_check =
970 RegExpMacroAssembler::kNoStackLimitCheck;
971 if (pushes == push_limit) {
972 stack_check = RegExpMacroAssembler::kCheckStackLimit;
973 pushes = 0;
974 }
975
976 assembler->PushRegister(reg, stack_check);
977 registers_to_pop->Set(reg, isolate);
978 } else if (undo_action == CLEAR) {
979 registers_to_clear->Set(reg, isolate);
980 }
981 // Perform the chronologically last action (or accumulated increment)
982 // for the register.
983 if (store_position != -1) {
984 assembler->WriteCurrentPositionToRegister(reg, store_position);
985 } else if (clear) {
986 assembler->ClearRegisters(reg, reg);
987 } else if (absolute) {
988 assembler->SetRegister(reg, value);
989 } else if (value != 0) {
990 assembler->AdvanceRegister(reg, value);
991 }
992 }
993 }
994
995
996 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
997 RegExpMacroAssembler* assembler = compiler->macro_assembler();
998
999 ASSERT(!is_trivial());
1000
1001 if (actions_ == NULL && backtrack() == NULL) {
1002 // Here we just have some deferred cp advances to fix and we are back to
1003 // a normal situation. We may also have to forget some information gained
1004 // through a quick check that was already performed.
1005 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1006 // Create a new trivial state and generate the node with that.
1007 Trace new_state;
1008 successor->Emit(compiler, &new_state);
1009 return;
1010 }
1011
1012 // Generate deferred actions here along with code to undo them again.
1013 OutSet affected_registers;
1014
1015 if (backtrack() != NULL) {
1016 // Here we have a concrete backtrack location. These are set up by choice
1017 // nodes and so they indicate that we have a deferred save of the current
1018 // position which we may need to emit here.
1019 assembler->PushCurrentPosition();
1020 }
1021
1022 intptr_t max_register = FindAffectedRegisters(&affected_registers, CI);
1023 OutSet registers_to_pop;
1024 OutSet registers_to_clear;
1025 PerformDeferredActions(assembler,
1026 max_register,
1027 affected_registers,
1028 &registers_to_pop,
1029 &registers_to_clear,
1030 CI);
1031 if (cp_offset_ != 0) {
1032 assembler->AdvanceCurrentPosition(cp_offset_);
1033 }
1034
1035 // Create a new trivial state and generate the node with that.
1036 BlockLabel undo;
1037 assembler->PushBacktrack(&undo);
1038 Trace new_state;
1039 successor->Emit(compiler, &new_state);
1040
1041 // On backtrack we need to restore state.
1042 assembler->BindBlock(&undo);
1043 RestoreAffectedRegisters(assembler,
1044 max_register,
1045 registers_to_pop,
1046 registers_to_clear);
1047 if (backtrack() == NULL) {
1048 assembler->Backtrack();
1049 } else {
1050 assembler->PopCurrentPosition();
1051 assembler->Jump(backtrack());
1052 }
1053 }
1054
1055
1056 void Trace::InvalidateCurrentCharacter() {
1057 characters_preloaded_ = 0;
1058 }
1059
1060
1061 void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
1062 RegExpCompiler* compiler) {
1063 ASSERT(by > 0);
1064 // We don't have an instruction for shifting the current character register
1065 // down or for using a shifted value for anything so lets just forget that
1066 // we preloaded any characters into it.
1067 characters_preloaded_ = 0;
1068 // Adjust the offsets of the quick check performed information. This
1069 // information is used to find out what we already determined about the
1070 // characters by means of mask and compare.
1071 quick_check_performed_.Advance(by, compiler->ascii());
1072 cp_offset_ += by;
1073 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
1074 compiler->SetRegExpTooBig();
1075 cp_offset_ = 0;
1076 }
1077 bound_checked_up_to_ = Utils::Maximum(0, bound_checked_up_to_ - by);
1078 }
1079
1080
1081 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1082 if (!trace->is_trivial()) {
1083 trace->Flush(compiler, this);
1084 return;
1085 }
1086 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1087 if (!label()->IsBound()) {
1088 assembler->BindBlock(label());
1089 }
1090 switch (action_) {
1091 case ACCEPT:
1092 assembler->Succeed();
1093 return;
1094 case BACKTRACK:
1095 assembler->Jump(trace->backtrack());
1096 return;
1097 case NEGATIVE_SUBMATCH_SUCCESS:
1098 // This case is handled in a different virtual method.
1099 UNREACHABLE();
1100 }
736 UNIMPLEMENTED(); 1101 UNIMPLEMENTED();
737 } 1102 }
738 #undef DEFINE_EMIT 1103
739 1104
740 1105 bool QuickCheckDetails::Rationalize(bool asc) {
741 #define DEFINE_BMINFO(Type) \ 1106 bool found_useful_op = false;
742 void Type##Node::FillInBMInfo(intptr_t initial_offset, \ 1107 uint32_t char_mask;
743 intptr_t budget, \ 1108 if (asc) {
744 BoyerMooreLookahead* bm, \ 1109 char_mask = Symbols::kMaxOneCharCodeSymbol;
745 bool not_at_start) { \ 1110 } else {
746 UNIMPLEMENTED(); \ 1111 char_mask = Utf16::kMaxCodeUnit;
747 } 1112 }
748 FOR_EACH_NODE_TYPE(DEFINE_BMINFO) 1113 mask_ = 0;
749 DEFINE_BMINFO(LoopChoice) 1114 value_ = 0;
750 #undef DEFINE_BMINFO 1115 intptr_t char_shift = 0;
1116 for (intptr_t i = 0; i < characters_; i++) {
1117 Position* pos = &positions_[i];
1118 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1119 found_useful_op = true;
1120 }
1121 mask_ |= (pos->mask & char_mask) << char_shift;
1122 value_ |= (pos->value & char_mask) << char_shift;
1123 char_shift += asc ? 8 : 16;
1124 }
1125 return found_useful_op;
1126 }
1127
1128
1129 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1130 Trace* trace,
1131 bool preload_has_checked_bounds,
1132 BlockLabel* on_possible_success,
1133 QuickCheckDetails* details,
1134 bool fall_through_on_failure) {
1135 if (details->characters() == 0) return false;
1136 GetQuickCheckDetails(
1137 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
1138 if (details->cannot_match()) return false;
1139 if (!details->Rationalize(compiler->ascii())) return false;
1140 ASSERT(details->characters() == 1 ||
1141 compiler->macro_assembler()->CanReadUnaligned());
1142 uint32_t mask = details->mask();
1143 uint32_t value = details->value();
1144
1145 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1146
1147 if (trace->characters_preloaded() != details->characters()) {
1148 assembler->LoadCurrentCharacter(trace->cp_offset(),
1149 trace->backtrack(),
1150 !preload_has_checked_bounds,
1151 details->characters());
1152 }
1153
1154
1155 bool need_mask = true;
1156
1157 if (details->characters() == 1) {
1158 // If number of characters preloaded is 1 then we used a byte or 16 bit
1159 // load so the value is already masked down.
1160 uint32_t char_mask;
1161 if (compiler->ascii()) {
1162 char_mask = Symbols::kMaxOneCharCodeSymbol;
1163 } else {
1164 char_mask = Utf16::kMaxCodeUnit;
1165 }
1166 if ((mask & char_mask) == char_mask) need_mask = false;
1167 mask &= char_mask;
1168 } else {
1169 // For 2-character preloads in ASCII mode or 1-character preloads in
1170 // TWO_BYTE mode we also use a 16 bit load with zero extend.
1171 if (details->characters() == 2 && compiler->ascii()) {
1172 if ((mask & 0xffff) == 0xffff) need_mask = false;
1173 } else if (details->characters() == 1 && !compiler->ascii()) {
1174 if ((mask & 0xffff) == 0xffff) need_mask = false;
1175 } else {
1176 if (mask == 0xffffffff) need_mask = false;
1177 }
1178 }
1179
1180 if (fall_through_on_failure) {
1181 if (need_mask) {
1182 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1183 } else {
1184 assembler->CheckCharacter(value, on_possible_success);
1185 }
1186 } else {
1187 if (need_mask) {
1188 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1189 } else {
1190 assembler->CheckNotCharacter(value, trace->backtrack());
1191 }
1192 }
1193 return true;
1194 }
1195
1196
1197 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
1198 // that matches newline or the start of input).
1199 static void EmitHat(RegExpCompiler* compiler,
1200 RegExpNode* on_success,
1201 Trace* trace) {
1202 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1203 // We will be loading the previous character into the current character
1204 // register.
1205 Trace new_trace(*trace);
1206 new_trace.InvalidateCurrentCharacter();
1207
1208 BlockLabel ok;
1209 if (new_trace.cp_offset() == 0) {
1210 // The start of input counts as a newline in this context, so skip to
1211 // ok if we are at the start.
1212 assembler->CheckAtStart(&ok);
1213 }
1214 // We already checked that we are not at the start of input so it must be
1215 // OK to load the previous character.
1216 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
1217 new_trace.backtrack(),
1218 false);
1219 if (!assembler->CheckSpecialCharacterClass('n',
1220 new_trace.backtrack())) {
1221 // Newline means \n, \r, 0x2028 or 0x2029.
1222 if (!compiler->ascii()) {
1223 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
1224 }
1225 assembler->CheckCharacter('\n', &ok);
1226 assembler->CheckNotCharacter('\r', new_trace.backtrack());
1227 }
1228 assembler->BindBlock(&ok);
1229 on_success->Emit(compiler, &new_trace);
1230 }
1231
1232
1233 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1234 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1235 switch (assertion_type_) {
1236 case AT_END: {
1237 BlockLabel ok;
1238 assembler->CheckPosition(trace->cp_offset(), &ok);
1239 assembler->Jump(trace->backtrack());
1240 assembler->BindBlock(&ok);
1241 break;
1242 }
1243 case AT_START: {
1244 if (trace->at_start() == Trace::FALSE_VALUE) {
1245 assembler->Jump(trace->backtrack());
1246 return;
1247 }
1248 if (trace->at_start() == Trace::UNKNOWN) {
1249 assembler->CheckNotAtStart(trace->backtrack());
1250 Trace at_start_trace = *trace;
1251 at_start_trace.set_at_start(true);
1252 on_success()->Emit(compiler, &at_start_trace);
1253 return;
1254 }
1255 }
1256 break;
1257 case AFTER_NEWLINE:
1258 EmitHat(compiler, on_success(), trace);
1259 return;
1260 case AT_BOUNDARY:
1261 case AT_NON_BOUNDARY: {
1262 EmitBoundaryCheck(compiler, trace);
1263 return;
1264 }
1265 }
1266 on_success()->Emit(compiler, trace);
1267 }
1268
1269
1270 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1271 Trace* trace) {
1272 // If we are generating a greedy loop then don't stop and don't reuse code.
1273 if (trace->stop_node() != NULL) {
1274 return CONTINUE;
1275 }
1276
1277 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1278 if (trace->is_trivial()) {
1279 if (label_.IsBound()) {
1280 // We are being asked to generate a generic version, but that's already
1281 // been done so just go to it.
1282 macro_assembler->Jump(&label_);
1283 return DONE;
1284 }
1285 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1286 // To avoid too deep recursion we push the node to the work queue and just
1287 // generate a goto here.
1288 compiler->AddWork(this);
1289 macro_assembler->Jump(&label_);
1290 return DONE;
1291 }
1292 // Generate generic version of the node and bind the label for later use.
1293 macro_assembler->BindBlock(&label_);
1294 return CONTINUE;
1295 }
1296
1297 // We are being asked to make a non-generic version. Keep track of how many
1298 // non-generic versions we generate so as not to overdo it.
1299 trace_count_++;
1300 if (kRegexpOptimization &&
1301 trace_count_ < kMaxCopiesCodeGenerated &&
1302 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1303 return CONTINUE;
1304 }
1305
1306 // If we get here code has been generated for this node too many times or
1307 // recursion is too deep. Time to switch to a generic version. The code for
1308 // generic versions above can handle deep recursion properly.
1309 trace->Flush(compiler, this);
1310 return DONE;
1311 }
1312
1313
1314 // This generates the code to match a text node. A text node can contain
1315 // straight character sequences (possibly to be matched in a case-independent
1316 // way) and character classes. For efficiency we do not do this in a single
1317 // pass from left to right. Instead we pass over the text node several times,
1318 // emitting code for some character positions every time. See the comment on
1319 // TextEmitPass for details.
1320 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1321 LimitResult limit_result = LimitVersions(compiler, trace);
1322 if (limit_result == DONE) return;
1323 ASSERT(limit_result == CONTINUE);
1324
1325 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
1326 compiler->SetRegExpTooBig();
1327 return;
1328 }
1329
1330 if (compiler->ascii()) {
1331 intptr_t dummy = 0;
1332 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
1333 }
1334
1335 bool first_elt_done = false;
1336 intptr_t bound_checked_to = trace->cp_offset() - 1;
1337 bound_checked_to += trace->bound_checked_up_to();
1338
1339 // If a character is preloaded into the current character register then
1340 // check that now.
1341 if (trace->characters_preloaded() == 1) {
1342 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
1343 if (!SkipPass(pass, compiler->ignore_case())) {
1344 TextEmitPass(compiler,
1345 static_cast<TextEmitPassType>(pass),
1346 true,
1347 trace,
1348 false,
1349 &bound_checked_to);
1350 }
1351 }
1352 first_elt_done = true;
1353 }
1354
1355 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
1356 if (!SkipPass(pass, compiler->ignore_case())) {
1357 TextEmitPass(compiler,
1358 static_cast<TextEmitPassType>(pass),
1359 false,
1360 trace,
1361 first_elt_done,
1362 &bound_checked_to);
1363 }
1364 }
1365
1366 Trace successor_trace(*trace);
1367 successor_trace.set_at_start(false);
1368 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
1369 RecursionCheck rc(compiler);
1370 on_success()->Emit(compiler, &successor_trace);
1371 }
1372
1373
1374 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1375 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1376 if (trace->stop_node() == this) {
1377 intptr_t text_length =
1378 GreedyLoopTextLengthForAlternative(&(alternatives_->At(0)));
1379 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
1380 // Update the counter-based backtracking info on the stack. This is an
1381 // optimization for greedy loops (see below).
1382 ASSERT(trace->cp_offset() == text_length);
1383 macro_assembler->AdvanceCurrentPosition(text_length);
1384 macro_assembler->Jump(trace->loop_label());
1385 return;
1386 }
1387 ASSERT(trace->stop_node() == NULL);
1388 if (!trace->is_trivial()) {
1389 trace->Flush(compiler, this);
1390 return;
1391 }
1392 ChoiceNode::Emit(compiler, trace);
1393 }
1394
1395
1396 // This class is used when generating the alternatives in a choice node. It
1397 // records the way the alternative is being code generated.
1398 class AlternativeGeneration {
1399 public:
1400 AlternativeGeneration()
1401 : possible_success(),
1402 expects_preload(false),
1403 after(),
1404 quick_check_details() { }
1405 BlockLabel possible_success;
1406 bool expects_preload;
1407 BlockLabel after;
1408 QuickCheckDetails quick_check_details;
1409 };
1410
1411
1412 // Creates a list of AlternativeGenerations. If the list has a reasonable
1413 // size then it is on the stack, otherwise the excess is on the heap.
1414 class AlternativeGenerationList {
1415 public:
1416 explicit AlternativeGenerationList(intptr_t count)
1417 : alt_gens_(count) {
1418 for (intptr_t i = 0; i < count && i < kAFew; i++) {
1419 alt_gens_.Add(a_few_alt_gens_ + i);
1420 }
1421 for (intptr_t i = kAFew; i < count; i++) {
1422 alt_gens_.Add(new AlternativeGeneration());
1423 }
1424 }
1425 ~AlternativeGenerationList() {
1426 for (intptr_t i = kAFew; i < alt_gens_.length(); i++) {
1427 delete alt_gens_[i];
1428 alt_gens_[i] = NULL;
1429 }
1430 }
1431
1432 AlternativeGeneration* at(intptr_t i) {
1433 return alt_gens_[i];
1434 }
1435
1436 private:
1437 static const intptr_t kAFew = 10;
1438 GrowableArray<AlternativeGeneration*> alt_gens_;
1439 AlternativeGeneration a_few_alt_gens_[kAFew];
1440 };
1441
1442
1443 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1444 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1445 intptr_t choice_count = alternatives_->length();
1446
1447 #ifdef DEBUG
1448 for (intptr_t i = 0; i < choice_count - 1; i++) {
1449 GuardedAlternative alternative = alternatives_->At(i);
1450 ZoneGrowableArray<Guard*>* guards = alternative.guards();
1451 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
1452 for (intptr_t j = 0; j < guard_count; j++) {
1453 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
1454 }
1455 }
1456 #endif
1457
1458 LimitResult limit_result = LimitVersions(compiler, trace);
1459 if (limit_result == DONE) return;
1460 ASSERT(limit_result == CONTINUE);
1461
1462 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
1463 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
1464 trace->Flush(compiler, this);
1465 return;
1466 }
1467
1468 RecursionCheck rc(compiler);
1469
1470 Trace* current_trace = trace;
1471
1472 intptr_t text_length =
1473 GreedyLoopTextLengthForAlternative(&(alternatives_->At(0)));
1474 bool greedy_loop = false;
1475 BlockLabel greedy_loop_label;
1476 Trace counter_backtrack_trace;
1477 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
1478 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
1479
1480 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
1481 // Here we have special handling for greedy loops containing only text nodes
1482 // and other simple nodes. These are handled by pushing the current
1483 // position on the stack and then incrementing the current position each
1484 // time around the switch. On backtrack we decrement the current position
1485 // and check it against the pushed value. This avoids pushing backtrack
1486 // information for each iteration of the loop, which could take up a lot of
1487 // space.
1488 greedy_loop = true;
1489 ASSERT(trace->stop_node() == NULL);
1490 macro_assembler->PushCurrentPosition();
1491 current_trace = &counter_backtrack_trace;
1492 BlockLabel greedy_match_failed;
1493 Trace greedy_match_trace;
1494 if (not_at_start()) greedy_match_trace.set_at_start(false);
1495 greedy_match_trace.set_backtrack(&greedy_match_failed);
1496 BlockLabel loop_label;
1497 macro_assembler->BindBlock(&loop_label);
1498 greedy_match_trace.set_stop_node(this);
1499 greedy_match_trace.set_loop_label(&loop_label);
1500 alternatives_->At(0).node()->Emit(compiler, &greedy_match_trace);
1501 macro_assembler->BindBlock(&greedy_match_failed);
1502 }
1503
1504 BlockLabel second_choice; // For use in greedy matches.
1505 macro_assembler->BindBlock(&second_choice);
1506
1507 intptr_t first_normal_choice = greedy_loop ? 1 : 0;
1508
1509 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
1510 const intptr_t kEatsAtLeastNotYetInitialized = -1;
1511 intptr_t eats_at_least = kEatsAtLeastNotYetInitialized;
1512
1513 bool skip_was_emitted = false;
1514
1515 if (!greedy_loop && choice_count == 2) {
1516 GuardedAlternative alt1 = alternatives_->At(1);
1517 if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
1518 RegExpNode* eats_anything_node = alt1.node();
1519 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
1520 this) {
1521 // At this point we know that we are at a non-greedy loop that will eat
1522 // any character one at a time. Any non-anchored regexp has such a
1523 // loop prepended to it in order to find where it starts. We look for
1524 // a pattern of the form ...abc... where we can look 6 characters ahead
1525 // and step forwards 3 if the character is not one of abc. Abc need
1526 // not be atoms, they can be any reasonably limited character class or
1527 // small alternation.
1528 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
1529 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
1530 if (lookahead == NULL) {
1531 eats_at_least = Utils::Minimum(kMaxLookaheadForBoyerMoore,
1532 EatsAtLeast(kMaxLookaheadForBoyerMoore,
1533 kRecursionBudget,
1534 not_at_start));
1535 if (eats_at_least >= 1) {
1536 BoyerMooreLookahead* bm =
1537 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);
1538 GuardedAlternative alt0 = alternatives_->At(0);
1539 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
1540 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
1541 }
1542 } else {
1543 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
1544 }
1545 }
1546 }
1547 }
1548
1549 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
1550 // Save some time by looking at most one machine word ahead.
1551 eats_at_least =
1552 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);
1553 }
1554 intptr_t preload_characters =
1555 CalculatePreloadCharacters(compiler, eats_at_least);
1556
1557 bool preload_is_current = !skip_was_emitted &&
1558 (current_trace->characters_preloaded() == preload_characters);
1559 bool preload_has_checked_bounds = preload_is_current;
1560
1561 AlternativeGenerationList alt_gens(choice_count);
1562
1563 // For now we just call all choices one after the other. The idea ultimately
1564 // is to use the Dispatch table to try only the relevant ones.
1565 for (intptr_t i = first_normal_choice; i < choice_count; i++) {
1566 GuardedAlternative alternative = alternatives_->At(i);
1567 AlternativeGeneration* alt_gen = alt_gens.at(i);
1568 alt_gen->quick_check_details.set_characters(preload_characters);
1569 ZoneGrowableArray<Guard*>* guards = alternative.guards();
1570 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
1571 Trace new_trace(*current_trace);
1572 new_trace.set_characters_preloaded(preload_is_current ?
1573 preload_characters :
1574 0);
1575 if (preload_has_checked_bounds) {
1576 new_trace.set_bound_checked_up_to(preload_characters);
1577 }
1578 new_trace.quick_check_performed()->Clear();
1579 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
1580 alt_gen->expects_preload = preload_is_current;
1581 bool generate_full_check_inline = false;
1582 if (kRegexpOptimization &&
1583 try_to_emit_quick_check_for_alternative(i) &&
1584 alternative.node()->EmitQuickCheck(compiler,
1585 &new_trace,
1586 preload_has_checked_bounds,
1587 &alt_gen->possible_success,
1588 &alt_gen->quick_check_details,
1589 i < choice_count - 1)) {
1590 // Quick check was generated for this choice.
1591 preload_is_current = true;
1592 preload_has_checked_bounds = true;
1593 // On the last choice in the ChoiceNode we generated the quick
1594 // check to fall through on possible success. So now we need to
1595 // generate the full check inline.
1596 if (i == choice_count - 1) {
1597 macro_assembler->BindBlock(&alt_gen->possible_success);
1598 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
1599 new_trace.set_characters_preloaded(preload_characters);
1600 new_trace.set_bound_checked_up_to(preload_characters);
1601 generate_full_check_inline = true;
1602 }
1603 } else if (alt_gen->quick_check_details.cannot_match()) {
1604 if (i == choice_count - 1 && !greedy_loop) {
1605 macro_assembler->GoTo(trace->backtrack());
1606 }
1607 continue;
1608 } else {
1609 // No quick check was generated. Put the full code here.
1610 // If this is not the first choice then there could be slow checks from
1611 // previous cases that go here when they fail. There's no reason to
1612 // insist that they preload characters since the slow check we are about
1613 // to generate probably can't use it.
1614 if (i != first_normal_choice) {
1615 alt_gen->expects_preload = false;
1616 new_trace.InvalidateCurrentCharacter();
1617 }
1618 if (i < choice_count - 1) {
1619 new_trace.set_backtrack(&alt_gen->after);
1620 }
1621 generate_full_check_inline = true;
1622 }
1623 if (generate_full_check_inline) {
1624 if (new_trace.actions() != NULL) {
1625 new_trace.set_flush_budget(new_flush_budget);
1626 }
1627 for (intptr_t j = 0; j < guard_count; j++) {
1628 GenerateGuard(macro_assembler, guards->At(j), &new_trace);
1629 }
1630 alternative.node()->Emit(compiler, &new_trace);
1631 preload_is_current = false;
1632 }
1633 macro_assembler->BindBlock(&alt_gen->after);
1634 }
1635 if (greedy_loop) {
1636 macro_assembler->BindBlock(&greedy_loop_label);
1637 // If we have unwound to the bottom then backtrack.
1638 macro_assembler->CheckGreedyLoop(trace->backtrack());
1639 // Otherwise try the second priority at an earlier position.
1640 macro_assembler->AdvanceCurrentPosition(-text_length);
1641 macro_assembler->Jump(&second_choice);
1642 }
1643
1644 // At this point we need to generate slow checks for the alternatives where
1645 // the quick check was inlined. We can recognize these because the associated
1646 // label was bound.
1647 for (intptr_t i = first_normal_choice; i < choice_count - 1; i++) {
1648 AlternativeGeneration* alt_gen = alt_gens.at(i);
1649 Trace new_trace(*current_trace);
1650 // If there are actions to be flushed we have to limit how many times
1651 // they are flushed. Take the budget of the parent trace and distribute
1652 // it fairly amongst the children.
1653 if (new_trace.actions() != NULL) {
1654 new_trace.set_flush_budget(new_flush_budget);
1655 }
1656 EmitOutOfLineContinuation(compiler,
1657 &new_trace,
1658 alternatives_->At(i),
1659 alt_gen,
1660 preload_characters,
1661 alt_gens.at(i + 1)->expects_preload);
1662 }
1663 }
1664
1665
1666 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
1667 Trace* trace,
1668 GuardedAlternative alternative,
1669 AlternativeGeneration* alt_gen,
1670 intptr_t preload_characters,
1671 bool next_expects_preload) {
1672 if (!alt_gen->possible_success.IsLinked()) return;
1673
1674 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1675 macro_assembler->BindBlock(&alt_gen->possible_success);
1676 Trace out_of_line_trace(*trace);
1677 out_of_line_trace.set_characters_preloaded(preload_characters);
1678 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
1679 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
1680 ZoneGrowableArray<Guard*>* guards = alternative.guards();
1681 intptr_t guard_count = (guards == NULL) ? 0 : guards->length();
1682 if (next_expects_preload) {
1683 BlockLabel reload_current_char;
1684 out_of_line_trace.set_backtrack(&reload_current_char);
1685 for (intptr_t j = 0; j < guard_count; j++) {
1686 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
1687 }
1688 alternative.node()->Emit(compiler, &out_of_line_trace);
1689 macro_assembler->BindBlock(&reload_current_char);
1690 // Reload the current character, since the next quick check expects that.
1691 // We don't need to check bounds here because we only get into this
1692 // code through a quick check which already did the checked load.
1693 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
1694 NULL,
1695 false,
1696 preload_characters);
1697 macro_assembler->Jump(&(alt_gen->after));
1698 } else {
1699 out_of_line_trace.set_backtrack(&(alt_gen->after));
1700 for (intptr_t j = 0; j < guard_count; j++) {
1701 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
1702 }
1703 alternative.node()->Emit(compiler, &out_of_line_trace);
1704 }
1705 }
1706
1707
1708 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1709 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1710 LimitResult limit_result = LimitVersions(compiler, trace);
1711 if (limit_result == DONE) return;
1712 ASSERT(limit_result == CONTINUE);
1713
1714 RecursionCheck rc(compiler);
1715
1716 switch (action_type_) {
1717 case STORE_POSITION: {
1718 Trace::DeferredCapture
1719 new_capture(data_.u_position_register.reg,
1720 data_.u_position_register.is_capture,
1721 trace);
1722 Trace new_trace = *trace;
1723 new_trace.add_action(&new_capture);
1724 on_success()->Emit(compiler, &new_trace);
1725 break;
1726 }
1727 case INCREMENT_REGISTER: {
1728 Trace::DeferredIncrementRegister
1729 new_increment(data_.u_increment_register.reg);
1730 Trace new_trace = *trace;
1731 new_trace.add_action(&new_increment);
1732 on_success()->Emit(compiler, &new_trace);
1733 break;
1734 }
1735 case SET_REGISTER: {
1736 Trace::DeferredSetRegister
1737 new_set(data_.u_store_register.reg, data_.u_store_register.value);
1738 Trace new_trace = *trace;
1739 new_trace.add_action(&new_set);
1740 on_success()->Emit(compiler, &new_trace);
1741 break;
1742 }
1743 case CLEAR_CAPTURES: {
1744 Trace::DeferredClearCaptures
1745 new_capture(Interval(data_.u_clear_captures.range_from,
1746 data_.u_clear_captures.range_to));
1747 Trace new_trace = *trace;
1748 new_trace.add_action(&new_capture);
1749 on_success()->Emit(compiler, &new_trace);
1750 break;
1751 }
1752 case BEGIN_SUBMATCH:
1753 if (!trace->is_trivial()) {
1754 trace->Flush(compiler, this);
1755 } else {
1756 assembler->WriteCurrentPositionToRegister(
1757 data_.u_submatch.current_position_register, 0);
1758 assembler->WriteStackPointerToRegister(
1759 data_.u_submatch.stack_pointer_register);
1760 on_success()->Emit(compiler, trace);
1761 }
1762 break;
1763 case EMPTY_MATCH_CHECK: {
1764 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
1765 intptr_t stored_pos = 0;
1766 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
1767 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
1768 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
1769 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
1770 // If we know we haven't advanced and there is no minimum we
1771 // can just backtrack immediately.
1772 assembler->GoTo(trace->backtrack());
1773 } else if (know_dist && stored_pos < trace->cp_offset()) {
1774 // If we know we've advanced we can generate the continuation
1775 // immediately.
1776 on_success()->Emit(compiler, trace);
1777 } else if (!trace->is_trivial()) {
1778 trace->Flush(compiler, this);
1779 } else {
1780 BlockLabel skip_empty_check;
1781 // If we have a minimum number of repetitions we check the current
1782 // number first and skip the empty check if it's not enough.
1783 if (has_minimum) {
1784 intptr_t limit = data_.u_empty_match_check.repetition_limit;
1785 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
1786 }
1787 // If the match is empty we bail out, otherwise we fall through
1788 // to the on-success continuation.
1789 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
1790 trace->backtrack());
1791 assembler->BindBlock(&skip_empty_check);
1792 on_success()->Emit(compiler, trace);
1793 }
1794 break;
1795 }
1796 case POSITIVE_SUBMATCH_SUCCESS: {
1797 if (!trace->is_trivial()) {
1798 trace->Flush(compiler, this);
1799 return;
1800 }
1801 assembler->ReadCurrentPositionFromRegister(
1802 data_.u_submatch.current_position_register);
1803 assembler->ReadStackPointerFromRegister(
1804 data_.u_submatch.stack_pointer_register);
1805 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
1806 if (clear_register_count == 0) {
1807 on_success()->Emit(compiler, trace);
1808 return;
1809 }
1810 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
1811 BlockLabel clear_registers_backtrack;
1812 Trace new_trace = *trace;
1813 new_trace.set_backtrack(&clear_registers_backtrack);
1814 on_success()->Emit(compiler, &new_trace);
1815
1816 assembler->BindBlock(&clear_registers_backtrack);
1817 intptr_t clear_registers_to =
1818 clear_registers_from + clear_register_count - 1;
1819 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
1820
1821 ASSERT(trace->backtrack() == NULL);
1822 assembler->Backtrack();
1823 return;
1824 }
1825 default:
1826 UNREACHABLE();
1827 }
1828 }
1829
1830
1831 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1832 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1833 if (!trace->is_trivial()) {
1834 trace->Flush(compiler, this);
1835 return;
1836 }
1837
1838 LimitResult limit_result = LimitVersions(compiler, trace);
1839 if (limit_result == DONE) return;
1840 ASSERT(limit_result == CONTINUE);
1841
1842 RecursionCheck rc(compiler);
1843
1844 ASSERT(start_reg_ + 1 == end_reg_);
1845 if (compiler->ignore_case()) {
1846 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
1847 trace->backtrack());
1848 } else {
1849 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
1850 }
1851 on_success()->Emit(compiler, trace);
1852 }
1853
1854
1855 void ActionNode::FillInBMInfo(intptr_t offset,
1856 intptr_t budget,
1857 BoyerMooreLookahead* bm,
1858 bool not_at_start) {
1859 if (action_type_ == BEGIN_SUBMATCH) {
1860 bm->SetRest(offset);
1861 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1862 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1863 }
1864 SaveBMInfo(bm, not_at_start, offset);
1865 }
1866
1867
1868 void AssertionNode::FillInBMInfo(intptr_t offset,
1869 intptr_t budget,
1870 BoyerMooreLookahead* bm,
1871 bool not_at_start) {
1872 // Match the behaviour of EatsAtLeast on this node.
1873 if (assertion_type() == AT_START && not_at_start) return;
1874 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1875 SaveBMInfo(bm, not_at_start, offset);
1876 }
1877
1878
1879 void BackReferenceNode::FillInBMInfo(intptr_t offset,
1880 intptr_t budget,
1881 BoyerMooreLookahead* bm,
1882 bool not_at_start) {
1883 // Working out the set of characters that a backreference can match is too
1884 // hard, so we just say that any character can match.
1885 bm->SetRest(offset);
1886 SaveBMInfo(bm, not_at_start, offset);
1887 }
1888
1889
1890 // Returns the number of characters in the equivalence class, omitting those
1891 // that cannot occur in the source string because it is ASCII.
1892 static intptr_t GetCaseIndependentLetters(uint16_t character,
1893 bool ascii_subject,
1894 unibrow::uchar* letters) {
1895 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
1896 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);
1897 // Unibrow returns 0 or 1 for characters where case independence is
1898 // trivial.
1899 if (length == 0) {
1900 letters[0] = character;
1901 length = 1;
1902 }
1903 if (!ascii_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
1904 return length;
1905 }
1906 // The standard requires that non-ASCII characters cannot have ASCII
1907 // character codes in their equivalence class.
1908 return 0;
1909 }
1910
1911
1912 void ChoiceNode::FillInBMInfo(intptr_t offset,
1913 intptr_t budget,
1914 BoyerMooreLookahead* bm,
1915 bool not_at_start) {
1916 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
1917 budget = (budget - 1) / alts->length();
1918 for (intptr_t i = 0; i < alts->length(); i++) {
1919 GuardedAlternative& alt = alts->At(i);
1920 if (alt.guards() != NULL && alt.guards()->length() != 0) {
1921 bm->SetRest(offset); // Give up trying to fill in info.
1922 SaveBMInfo(bm, not_at_start, offset);
1923 return;
1924 }
1925 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
1926 }
1927 SaveBMInfo(bm, not_at_start, offset);
1928 }
1929
1930
1931 void EndNode::FillInBMInfo(intptr_t offset,
1932 intptr_t budget,
1933 BoyerMooreLookahead* bm,
1934 bool not_at_start) {
1935 // Returning 0 from EatsAtLeast should ensure we never get here.
1936 UNREACHABLE();
1937 }
1938
1939
1940 void LoopChoiceNode::FillInBMInfo(intptr_t offset,
1941 intptr_t budget,
1942 BoyerMooreLookahead* bm,
1943 bool not_at_start) {
1944 if (body_can_be_zero_length_ || budget <= 0) {
1945 bm->SetRest(offset);
1946 SaveBMInfo(bm, not_at_start, offset);
1947 return;
1948 }
1949 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
1950 SaveBMInfo(bm, not_at_start, offset);
1951 }
1952
1953
1954 void TextNode::FillInBMInfo(intptr_t initial_offset,
1955 intptr_t budget,
1956 BoyerMooreLookahead* bm,
1957 bool not_at_start) {
1958 if (initial_offset >= bm->length()) return;
1959 intptr_t offset = initial_offset;
1960 intptr_t max_char = bm->max_char();
1961 for (intptr_t i = 0; i < elements()->length(); i++) {
1962 if (offset >= bm->length()) {
1963 if (initial_offset == 0) set_bm_info(not_at_start, bm);
1964 return;
1965 }
1966 TextElement text = elements()->At(i);
1967 if (text.text_type() == TextElement::ATOM) {
1968 RegExpAtom* atom = text.atom();
1969 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
1970 if (offset >= bm->length()) {
1971 if (initial_offset == 0) set_bm_info(not_at_start, bm);
1972 return;
1973 }
1974 uint16_t character = atom->data()->At(j);
1975 if (bm->compiler()->ignore_case()) {
1976 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1977 intptr_t length = GetCaseIndependentLetters(
1978 character,
1979 bm->max_char() == Symbols::kMaxOneCharCodeSymbol,
1980 chars);
1981 for (intptr_t j = 0; j < length; j++) {
1982 bm->Set(offset, chars[j]);
1983 }
1984 } else {
1985 if (character <= max_char) bm->Set(offset, character);
1986 }
1987 }
1988 } else {
1989 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
1990 RegExpCharacterClass* char_class = text.char_class();
1991 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
1992 if (char_class->is_negated()) {
1993 bm->SetAll(offset);
1994 } else {
1995 for (intptr_t k = 0; k < ranges->length(); k++) {
1996 CharacterRange& range = ranges->At(k);
1997 if (range.from() > max_char) continue;
1998 intptr_t to = Utils::Minimum(max_char, static_cast<int>(range.to()));
1999 bm->SetInterval(offset, Interval(range.from(), to));
2000 }
2001 }
2002 offset++;
2003 }
2004 }
2005 if (offset >= bm->length()) {
2006 if (initial_offset == 0) set_bm_info(not_at_start, bm);
2007 return;
2008 }
2009 on_success()->FillInBMInfo(offset,
2010 budget - 1,
2011 bm,
2012 true); // Not at start after a text node.
2013 if (initial_offset == 0) set_bm_info(not_at_start, bm);
2014 }
2015
2016
2017 // Check for [0-9A-Z_a-z].
2018 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2019 BlockLabel* word,
2020 BlockLabel* non_word,
2021 bool fall_through_on_word) {
2022 if (assembler->CheckSpecialCharacterClass(
2023 fall_through_on_word ? 'w' : 'W',
2024 fall_through_on_word ? non_word : word)) {
2025 // Optimized implementation available.
2026 return;
2027 }
2028 assembler->CheckCharacterGT('z', non_word);
2029 assembler->CheckCharacterLT('0', non_word);
2030 assembler->CheckCharacterGT('a' - 1, word);
2031 assembler->CheckCharacterLT('9' + 1, word);
2032 assembler->CheckCharacterLT('A', non_word);
2033 assembler->CheckCharacterLT('Z' + 1, word);
2034 if (fall_through_on_word) {
2035 assembler->CheckNotCharacter('_', non_word);
2036 } else {
2037 assembler->CheckCharacter('_', word);
2038 }
2039 }
751 2040
752 2041
753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2042 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
754 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 2043 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
755 UNIMPLEMENTED(); 2044 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2045 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2046 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2047 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2048 if (lookahead == NULL) {
2049 intptr_t eats_at_least =
2050 Utils::Minimum(kMaxLookaheadForBoyerMoore,
2051 EatsAtLeast(kMaxLookaheadForBoyerMoore,
2052 kRecursionBudget,
2053 not_at_start));
2054 if (eats_at_least >= 1) {
2055 BoyerMooreLookahead* bm =
2056 new(I) BoyerMooreLookahead(eats_at_least, compiler, I);
2057 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
2058 if (bm->at(0)->is_non_word())
2059 next_is_word_character = Trace::FALSE_VALUE;
2060 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2061 }
2062 } else {
2063 if (lookahead->at(0)->is_non_word())
2064 next_is_word_character = Trace::FALSE_VALUE;
2065 if (lookahead->at(0)->is_word())
2066 next_is_word_character = Trace::TRUE_VALUE;
2067 }
2068 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2069 if (next_is_word_character == Trace::UNKNOWN) {
2070 BlockLabel before_non_word;
2071 BlockLabel before_word;
2072 if (trace->characters_preloaded() != 1) {
2073 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2074 }
2075 // Fall through on non-word.
2076 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2077 // Next character is not a word character.
2078 assembler->BindBlock(&before_non_word);
2079 BlockLabel ok;
2080 // Backtrack on \B (non-boundary check) if previous is a word,
2081 // since we know next *is not* a word and this would be a boundary.
2082 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2083
2084 if (!assembler->IsClosed()) {
2085 assembler->Jump(&ok);
2086 }
2087
2088 assembler->BindBlock(&before_word);
2089 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2090 assembler->BindBlock(&ok);
2091 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2092 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2093 } else {
2094 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2095 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2096 }
756 } 2097 }
757 2098
758 2099
759 void AssertionNode::BacktrackIfPrevious( 2100 void AssertionNode::BacktrackIfPrevious(
760 RegExpCompiler* compiler, 2101 RegExpCompiler* compiler,
761 Trace* trace, 2102 Trace* trace,
762 AssertionNode::IfPrevious backtrack_if_previous) { 2103 AssertionNode::IfPrevious backtrack_if_previous) {
763 UNIMPLEMENTED(); 2104 RegExpMacroAssembler* assembler = compiler->macro_assembler();
764 } 2105 Trace new_trace(*trace);
765 2106 new_trace.InvalidateCurrentCharacter();
766 2107
2108 BlockLabel fall_through, dummy;
2109
2110 BlockLabel* non_word = backtrack_if_previous == kIsNonWord ?
2111 new_trace.backtrack() :
2112 &fall_through;
2113 BlockLabel* word = backtrack_if_previous == kIsNonWord ?
2114 &fall_through :
2115 new_trace.backtrack();
2116
2117 if (new_trace.cp_offset() == 0) {
2118 // The start of input counts as a non-word character, so the question is
2119 // decided if we are at the start.
2120 assembler->CheckAtStart(non_word);
2121 }
2122 // We already checked that we are not at the start of input so it must be
2123 // OK to load the previous character.
2124 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2125 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2126
2127 assembler->BindBlock(&fall_through);
2128 on_success()->Emit(compiler, &new_trace);
2129 }
2130
2131
2132 static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2133 if (quick_check == NULL) return false;
2134 if (offset >= quick_check->characters()) return false;
2135 return quick_check->positions(offset)->determines_perfectly;
2136 }
2137
2138
2139 static void UpdateBoundsCheck(intptr_t index, int* checked_up_to) {
2140 if (index > *checked_up_to) {
2141 *checked_up_to = index;
2142 }
2143 }
2144
2145
2146 typedef bool EmitCharacterFunction(Isolate* isolate,
2147 RegExpCompiler* compiler,
2148 uint16_t c,
2149 BlockLabel* on_failure,
2150 intptr_t cp_offset,
2151 bool check,
2152 bool preloaded);
2153
2154
2155 static inline bool EmitSimpleCharacter(Isolate* isolate,
2156 RegExpCompiler* compiler,
2157 uint16_t c,
2158 BlockLabel* on_failure,
2159 intptr_t cp_offset,
2160 bool check,
2161 bool preloaded) {
2162 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2163 bool bound_checked = false;
2164 if (!preloaded) {
2165 assembler->LoadCurrentCharacter(
2166 cp_offset,
2167 on_failure,
2168 check);
2169 bound_checked = true;
2170 }
2171 assembler->CheckNotCharacter(c, on_failure);
2172 return bound_checked;
2173 }
2174
2175
2176 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
2177 bool ascii,
2178 uint16_t c1,
2179 uint16_t c2,
2180 BlockLabel* on_failure) {
2181 uint16_t char_mask;
2182 if (ascii) {
2183 char_mask = Symbols::kMaxOneCharCodeSymbol;
2184 } else {
2185 char_mask = Utf16::kMaxCodeUnit;
2186 }
2187 uint16_t exor = c1 ^ c2;
2188 // Check whether exor has only one bit set.
2189 if (((exor - 1) & exor) == 0) {
2190 // If c1 and c2 differ only by one bit.
2191 // Ecma262UnCanonicalize always gives the highest number last.
2192 ASSERT(c2 > c1);
2193 uint16_t mask = char_mask ^ exor;
2194 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
2195 return true;
2196 }
2197 ASSERT(c2 > c1);
2198 uint16_t diff = c2 - c1;
2199 if (((diff - 1) & diff) == 0 && c1 >= diff) {
2200 // If the characters differ by 2^n but don't differ by one bit then
2201 // subtract the difference from the found character, then do the or
2202 // trick. We avoid the theoretical case where negative numbers are
2203 // involved in order to simplify code generation.
2204 uint16_t mask = char_mask ^ diff;
2205 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
2206 diff,
2207 mask,
2208 on_failure);
2209 return true;
2210 }
2211 return false;
2212 }
2213
2214 // Only emits letters (things that have case). Only used for case independent
2215 // matches.
2216 static inline bool EmitAtomLetter(Isolate* isolate,
2217 RegExpCompiler* compiler,
2218 uint16_t c,
2219 BlockLabel* on_failure,
2220 intptr_t cp_offset,
2221 bool check,
2222 bool preloaded) {
2223 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2224 bool ascii = compiler->ascii();
2225 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2226 intptr_t length = GetCaseIndependentLetters(c, ascii, chars);
2227 if (length <= 1) return false;
2228 // We may not need to check against the end of the input string
2229 // if this character lies before a character that matched.
2230 if (!preloaded) {
2231 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
2232 }
2233 BlockLabel ok;
2234 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
2235 switch (length) {
2236 case 2: {
2237 if (ShortCutEmitCharacterPair(macro_assembler,
2238 ascii,
2239 chars[0],
2240 chars[1],
2241 on_failure)) {
2242 } else {
2243 macro_assembler->CheckCharacter(chars[0], &ok);
2244 macro_assembler->CheckNotCharacter(chars[1], on_failure);
2245 macro_assembler->BindBlock(&ok);
2246 }
2247 break;
2248 }
2249 case 4:
2250 macro_assembler->CheckCharacter(chars[3], &ok);
2251 // Fall through!
2252 case 3:
2253 macro_assembler->CheckCharacter(chars[0], &ok);
2254 macro_assembler->CheckCharacter(chars[1], &ok);
2255 macro_assembler->CheckNotCharacter(chars[2], on_failure);
2256 macro_assembler->BindBlock(&ok);
2257 break;
2258 default:
2259 UNREACHABLE();
2260 break;
2261 }
2262 return true;
2263 }
2264
2265
2266 // Only emits non-letters (things that don't have case). Only used for case
2267 // independent matches.
2268 static inline bool EmitAtomNonLetter(Isolate* isolate,
2269 RegExpCompiler* compiler,
2270 uint16_t c,
2271 BlockLabel* on_failure,
2272 intptr_t cp_offset,
2273 bool check,
2274 bool preloaded) {
2275 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2276 bool ascii = compiler->ascii();
2277 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2278 intptr_t length = GetCaseIndependentLetters(c, ascii, chars);
2279 if (length < 1) {
2280 // This can't match. Must be an ASCII subject and a non-ASCII character.
2281 // We do not need to do anything since the ASCII pass already handled this.
2282 return false; // Bounds not checked.
2283 }
2284 bool checked = false;
2285 // We handle the length > 1 case in a later pass.
2286 if (length == 1) {
2287 if (ascii && c > Symbols::kMaxOneCharCodeSymbol) {
2288 // Can't match - see above.
2289 return false; // Bounds not checked.
2290 }
2291 if (!preloaded) {
2292 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
2293 checked = check;
2294 }
2295 macro_assembler->CheckNotCharacter(c, on_failure);
2296 }
2297 return checked;
2298 }
2299
2300
2301 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
2302 intptr_t border,
2303 BlockLabel* fall_through,
2304 BlockLabel* above_or_equal,
2305 BlockLabel* below) {
2306 if (below != fall_through) {
2307 masm->CheckCharacterLT(border, below);
2308 if (above_or_equal != fall_through) masm->Jump(above_or_equal);
2309 } else {
2310 masm->CheckCharacterGT(border - 1, above_or_equal);
2311 }
2312 }
2313
2314
2315 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
2316 intptr_t first,
2317 intptr_t last,
2318 BlockLabel* fall_through,
2319 BlockLabel* in_range,
2320 BlockLabel* out_of_range) {
2321 if (in_range == fall_through) {
2322 if (first == last) {
2323 masm->CheckNotCharacter(first, out_of_range);
2324 } else {
2325 masm->CheckCharacterNotInRange(first, last, out_of_range);
2326 }
2327 } else {
2328 if (first == last) {
2329 masm->CheckCharacter(first, in_range);
2330 } else {
2331 masm->CheckCharacterInRange(first, last, in_range);
2332 }
2333 if (out_of_range != fall_through) masm->Jump(out_of_range);
2334 }
2335 }
2336
2337
2338 static void CutOutRange(RegExpMacroAssembler* masm,
2339 ZoneGrowableArray<int>* ranges,
2340 intptr_t start_index,
2341 intptr_t end_index,
2342 intptr_t cut_index,
2343 BlockLabel* even_label,
2344 BlockLabel* odd_label) {
2345 bool odd = (((cut_index - start_index) & 1) == 1);
2346 BlockLabel* in_range_label = odd ? odd_label : even_label;
2347 BlockLabel dummy;
2348 EmitDoubleBoundaryTest(masm,
2349 ranges->At(cut_index),
2350 ranges->At(cut_index + 1) - 1,
2351 &dummy,
2352 in_range_label,
2353 &dummy);
2354 ASSERT(!dummy.IsLinked());
2355 // Cut out the single range by rewriting the array. This creates a new
2356 // range that is a merger of the two ranges on either side of the one we
2357 // are cutting out. The oddity of the labels is preserved.
2358 for (intptr_t j = cut_index; j > start_index; j--) {
2359 ranges->At(j) = ranges->At(j - 1);
2360 }
2361 for (intptr_t j = cut_index + 1; j < end_index; j++) {
2362 ranges->At(j) = ranges->At(j + 1);
2363 }
2364 }
2365
2366
2367 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
2368 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
2369 static void EmitUseLookupTable(
2370 RegExpMacroAssembler* masm,
2371 ZoneGrowableArray<int>* ranges,
2372 intptr_t start_index,
2373 intptr_t end_index,
2374 intptr_t min_char,
2375 BlockLabel* fall_through,
2376 BlockLabel* even_label,
2377 BlockLabel* odd_label) {
2378 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2379 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
2380
2381 intptr_t base = (min_char & ~kMask);
2382
2383 // Assert that everything is on one kTableSize page.
2384 for (intptr_t i = start_index; i <= end_index; i++) {
2385 ASSERT((ranges->At(i) & ~kMask) == base);
2386 }
2387 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
2388
2389 char templ[kSize];
2390 BlockLabel* on_bit_set;
2391 BlockLabel* on_bit_clear;
2392 intptr_t bit;
2393 if (even_label == fall_through) {
2394 on_bit_set = odd_label;
2395 on_bit_clear = even_label;
2396 bit = 1;
2397 } else {
2398 on_bit_set = even_label;
2399 on_bit_clear = odd_label;
2400 bit = 0;
2401 }
2402 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;
2403 i++) {
2404 templ[i] = bit;
2405 }
2406 intptr_t j = 0;
2407 bit ^= 1;
2408 for (intptr_t i = start_index; i < end_index; i++) {
2409 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {
2410 templ[j] = bit;
2411 }
2412 bit ^= 1;
2413 }
2414 for (intptr_t i = j; i < kSize; i++) {
2415 templ[i] = bit;
2416 }
2417 // TODO(erikcorry): Cache these.
2418 TypedData& ba = TypedData::ZoneHandle(
2419 masm->isolate(),
2420 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
2421 for (intptr_t i = 0; i < kSize; i++) {
2422 ba.SetUint8(i, templ[i]);
2423 }
2424 masm->CheckBitInTable(ba, on_bit_set);
2425 if (on_bit_clear != fall_through) masm->Jump(on_bit_clear);
2426 }
2427
2428
2429 // Unicode case. Split the search space into kSize spaces that are handled
2430 // with recursion.
2431 static void SplitSearchSpace(ZoneGrowableArray<int>* ranges,
2432 intptr_t start_index,
2433 intptr_t end_index,
2434 int* new_start_index,
2435 int* new_end_index,
2436 int* border) {
2437 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2438 static const intptr_t kMask = RegExpMacroAssembler::kTableMask;
2439
2440 intptr_t first = ranges->At(start_index);
2441 intptr_t last = ranges->At(end_index) - 1;
2442
2443 *new_start_index = start_index;
2444 *border = (ranges->At(start_index) & ~kMask) + kSize;
2445 while (*new_start_index < end_index) {
2446 if (ranges->At(*new_start_index) > *border) break;
2447 (*new_start_index)++;
2448 }
2449 // new_start_index is the index of the first edge that is beyond the
2450 // current kSize space.
2451
2452 // For very large search spaces we do a binary chop search of the non-ASCII
2453 // space instead of just going to the end of the current kSize space. The
2454 // heuristics are complicated a little by the fact that any 128-character
2455 // encoding space can be quickly tested with a table lookup, so we don't
2456 // wish to do binary chop search at a smaller granularity than that. A
2457 // 128-character space can take up a lot of space in the ranges array if,
2458 // for example, we only want to match every second character (eg. the lower
2459 // case characters on some Unicode pages).
2460 intptr_t binary_chop_index = (end_index + start_index) / 2;
2461 // The first test ensures that we get to the code that handles the ASCII
2462 // range with a single not-taken branch, speeding up this important
2463 // character range (even non-ASCII charset-based text has spaces and
2464 // punctuation).
2465 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // ASCII case.
2466 end_index - start_index > (*new_start_index - start_index) * 2 &&
2467 last - first > kSize * 2 &&
2468 binary_chop_index > *new_start_index &&
2469 ranges->At(binary_chop_index) >= first + 2 * kSize) {
2470 intptr_t scan_forward_for_section_border = binary_chop_index;;
2471 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1;
2472
2473 while (scan_forward_for_section_border < end_index) {
2474 if (ranges->At(scan_forward_for_section_border) > new_border) {
2475 *new_start_index = scan_forward_for_section_border;
2476 *border = new_border;
2477 break;
2478 }
2479 scan_forward_for_section_border++;
2480 }
2481 }
2482
2483 ASSERT(*new_start_index > start_index);
2484 *new_end_index = *new_start_index - 1;
2485 if (ranges->At(*new_end_index) == *border) {
2486 (*new_end_index)--;
2487 }
2488 if (*border >= ranges->At(end_index)) {
2489 *border = ranges->At(end_index);
2490 *new_start_index = end_index; // Won't be used.
2491 *new_end_index = end_index - 1;
2492 }
2493 }
2494
2495
2496 // Gets a series of segment boundaries representing a character class. If the
2497 // character is in the range between an even and an odd boundary (counting from
2498 // start_index) then go to even_label, otherwise go to odd_label. We already
2499 // know that the character is in the range of min_char to max_char inclusive.
2500 // Either label can be NULL indicating backtracking. Either label can also be
2501 // equal to the fall_through label.
2502 static void GenerateBranches(RegExpMacroAssembler* masm,
2503 ZoneGrowableArray<int>* ranges,
2504 intptr_t start_index,
2505 intptr_t end_index,
2506 uint16_t min_char,
2507 uint16_t max_char,
2508 BlockLabel* fall_through,
2509 BlockLabel* even_label,
2510 BlockLabel* odd_label) {
2511 intptr_t first = ranges->At(start_index);
2512 intptr_t last = ranges->At(end_index) - 1;
2513
2514 ASSERT(min_char < first);
2515
2516 // Just need to test if the character is before or on-or-after
2517 // a particular character.
2518 if (start_index == end_index) {
2519 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2520 return;
2521 }
2522
2523 // Another almost trivial case: There is one interval in the middle that is
2524 // different from the end intervals.
2525 if (start_index + 1 == end_index) {
2526 EmitDoubleBoundaryTest(
2527 masm, first, last, fall_through, even_label, odd_label);
2528 return;
2529 }
2530
2531 // It's not worth using table lookup if there are very few intervals in the
2532 // character class.
2533 if (end_index - start_index <= 6) {
2534 // It is faster to test for individual characters, so we look for those
2535 // first, then try arbitrary ranges in the second round.
2536 static intptr_t kNoCutIndex = -1;
2537 intptr_t cut = kNoCutIndex;
2538 for (intptr_t i = start_index; i < end_index; i++) {
2539 if (ranges->At(i) == ranges->At(i + 1) - 1) {
2540 cut = i;
2541 break;
2542 }
2543 }
2544 if (cut == kNoCutIndex) cut = start_index;
2545 CutOutRange(
2546 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2547 ASSERT(end_index - start_index >= 2);
2548 GenerateBranches(masm,
2549 ranges,
2550 start_index + 1,
2551 end_index - 1,
2552 min_char,
2553 max_char,
2554 fall_through,
2555 even_label,
2556 odd_label);
2557 return;
2558 }
2559
2560 // If there are a lot of intervals in the regexp, then we will use tables to
2561 // determine whether the character is inside or outside the character class.
2562 static const intptr_t kBits = RegExpMacroAssembler::kTableSizeBits;
2563
2564 if ((max_char >> kBits) == (min_char >> kBits)) {
2565 EmitUseLookupTable(masm,
2566 ranges,
2567 start_index,
2568 end_index,
2569 min_char,
2570 fall_through,
2571 even_label,
2572 odd_label);
2573 return;
2574 }
2575
2576 if ((min_char >> kBits) != (first >> kBits)) {
2577 masm->CheckCharacterLT(first, odd_label);
2578 GenerateBranches(masm,
2579 ranges,
2580 start_index + 1,
2581 end_index,
2582 first,
2583 max_char,
2584 fall_through,
2585 odd_label,
2586 even_label);
2587 return;
2588 }
2589
2590 intptr_t new_start_index = 0;
2591 intptr_t new_end_index = 0;
2592 intptr_t border = 0;
2593
2594 SplitSearchSpace(ranges,
2595 start_index,
2596 end_index,
2597 &new_start_index,
2598 &new_end_index,
2599 &border);
2600
2601 BlockLabel handle_rest;
2602 BlockLabel* above = &handle_rest;
2603 if (border == last + 1) {
2604 // We didn't find any section that started after the limit, so everything
2605 // above the border is one of the terminal labels.
2606 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2607 ASSERT(new_end_index == end_index - 1);
2608 }
2609
2610 ASSERT(start_index <= new_end_index);
2611 ASSERT(new_start_index <= end_index);
2612 ASSERT(start_index < new_start_index);
2613 ASSERT(new_end_index < end_index);
2614 ASSERT(new_end_index + 1 == new_start_index ||
2615 (new_end_index + 2 == new_start_index &&
2616 border == ranges->At(new_end_index + 1)));
2617 ASSERT(min_char < border - 1);
2618 ASSERT(border < max_char);
2619 ASSERT(ranges->At(new_end_index) < border);
2620 ASSERT(border < ranges->At(new_start_index) ||
2621 (border == ranges->At(new_start_index) &&
2622 new_start_index == end_index &&
2623 new_end_index == end_index - 1 &&
2624 border == last + 1));
2625 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
2626
2627 masm->CheckCharacterGT(border - 1, above);
2628 BlockLabel dummy;
2629 GenerateBranches(masm,
2630 ranges,
2631 start_index,
2632 new_end_index,
2633 min_char,
2634 border - 1,
2635 &dummy,
2636 even_label,
2637 odd_label);
2638
2639 if (handle_rest.IsLinked()) {
2640 masm->BindBlock(&handle_rest);
2641 bool flip = (new_start_index & 1) != (start_index & 1);
2642 GenerateBranches(masm,
2643 ranges,
2644 new_start_index,
2645 end_index,
2646 border,
2647 max_char,
2648 &dummy,
2649 flip ? odd_label : even_label,
2650 flip ? even_label : odd_label);
2651 }
2652 }
2653
2654
2655 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2656 RegExpCharacterClass* cc,
2657 bool ascii,
2658 BlockLabel* on_failure,
2659 intptr_t cp_offset,
2660 bool check_offset,
2661 bool preloaded,
2662 Isolate* isolate) {
2663 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2664 if (!CharacterRange::IsCanonical(ranges)) {
2665 CharacterRange::Canonicalize(ranges);
2666 }
2667
2668 intptr_t max_char;
2669 if (ascii) {
2670 max_char = Symbols::kMaxOneCharCodeSymbol;
2671 } else {
2672 max_char = Utf16::kMaxCodeUnit;
2673 }
2674
2675 intptr_t range_count = ranges->length();
2676
2677 intptr_t last_valid_range = range_count - 1;
2678 while (last_valid_range >= 0) {
2679 CharacterRange& range = ranges->At(last_valid_range);
2680 if (range.from() <= max_char) {
2681 break;
2682 }
2683 last_valid_range--;
2684 }
2685
2686 if (last_valid_range < 0) {
2687 if (!cc->is_negated()) {
2688 macro_assembler->GoTo(on_failure);
2689 }
2690 if (check_offset) {
2691 macro_assembler->CheckPosition(cp_offset, on_failure);
2692 }
2693 return;
2694 }
2695
2696 if (last_valid_range == 0 &&
2697 ranges->At(0).IsEverything(max_char)) {
2698 if (cc->is_negated()) {
2699 macro_assembler->GoTo(on_failure);
2700 } else {
2701 // This is a common case hit by non-anchored expressions.
2702 if (check_offset) {
2703 macro_assembler->CheckPosition(cp_offset, on_failure);
2704 }
2705 }
2706 return;
2707 }
2708 if (last_valid_range == 0 &&
2709 !cc->is_negated() &&
2710 ranges->At(0).IsEverything(max_char)) {
2711 // This is a common case hit by non-anchored expressions.
2712 if (check_offset) {
2713 macro_assembler->CheckPosition(cp_offset, on_failure);
2714 }
2715 return;
2716 }
2717
2718 if (!preloaded) {
2719 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2720 }
2721
2722 if (cc->is_standard() &&
2723 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2724 on_failure)) {
2725 return;
2726 }
2727
2728
2729 // A new list with ascending entries. Each entry is a code unit
2730 // where there is a boundary between code units that are part of
2731 // the class and code units that are not. Normally we insert an
2732 // entry at zero which goes to the failure label, but if there
2733 // was already one there we fall through for success on that entry.
2734 // Subsequent entries have alternating meaning (success/failure).
2735 ZoneGrowableArray<int>* range_boundaries =
2736 new(isolate) ZoneGrowableArray<int>(last_valid_range);
2737
2738 bool zeroth_entry_is_failure = !cc->is_negated();
2739
2740 for (intptr_t i = 0; i <= last_valid_range; i++) {
2741 CharacterRange& range = ranges->At(i);
2742 if (range.from() == 0) {
2743 ASSERT(i == 0);
2744 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2745 } else {
2746 range_boundaries->Add(range.from());
2747 }
2748 range_boundaries->Add(range.to() + 1);
2749 }
2750 intptr_t end_index = range_boundaries->length() - 1;
2751 if (range_boundaries->At(end_index) > max_char) {
2752 end_index--;
2753 }
2754
2755 BlockLabel fall_through;
2756 GenerateBranches(macro_assembler,
2757 range_boundaries,
2758 0, // start_index.
2759 end_index,
2760 0, // min_char.
2761 max_char,
2762 &fall_through,
2763 zeroth_entry_is_failure ? &fall_through : on_failure,
2764 zeroth_entry_is_failure ? on_failure : &fall_through);
2765 macro_assembler->BindBlock(&fall_through);
2766 }
2767
2768
767 // We call this repeatedly to generate code for each pass over the text node. 2769 // We call this repeatedly to generate code for each pass over the text node.
768 // The passes are in increasing order of difficulty because we hope one 2770 // The passes are in increasing order of difficulty because we hope one
769 // of the first passes will fail in which case we are saved the work of the 2771 // of the first passes will fail in which case we are saved the work of the
770 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2772 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
771 // we will check the '%' in the first pass, the case independent 'a' in the 2773 // we will check the '%' in the first pass, the case independent 'a' in the
772 // second pass and the character class in the last pass. 2774 // second pass and the character class in the last pass.
773 // 2775 //
774 // The passes are done from right to left, so for example to test for /bar/ 2776 // The passes are done from right to left, so for example to test for /bar/
775 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2777 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
776 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2778 // and then a 'b' with offset 0. This means we can avoid the end-of-input
(...skipping 15 matching lines...) Expand all
792 // loading characters, which means we do not need to recheck the bounds 2794 // loading characters, which means we do not need to recheck the bounds
793 // up to the limit the quick check already checked. In addition the quick 2795 // up to the limit the quick check already checked. In addition the quick
794 // check can have involved a mask and compare operation which may simplify 2796 // check can have involved a mask and compare operation which may simplify
795 // or obviate the need for further checks at some character positions. 2797 // or obviate the need for further checks at some character positions.
796 void TextNode::TextEmitPass(RegExpCompiler* compiler, 2798 void TextNode::TextEmitPass(RegExpCompiler* compiler,
797 TextEmitPassType pass, 2799 TextEmitPassType pass,
798 bool preloaded, 2800 bool preloaded,
799 Trace* trace, 2801 Trace* trace,
800 bool first_element_checked, 2802 bool first_element_checked,
801 intptr_t* checked_up_to) { 2803 intptr_t* checked_up_to) {
802 UNIMPLEMENTED(); 2804 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2805 bool ascii = compiler->ascii();
2806 BlockLabel* backtrack = trace->backtrack();
2807 QuickCheckDetails* quick_check = trace->quick_check_performed();
2808 intptr_t element_count = elms_->length();
2809 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2810 TextElement elm = elms_->At(i);
2811 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset();
2812 if (elm.text_type() == TextElement::ATOM) {
2813 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2814 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2815 if (first_element_checked && i == 0 && j == 0) continue;
2816 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2817 EmitCharacterFunction* emit_function = NULL;
2818 switch (pass) {
2819 case NON_ASCII_MATCH:
2820 ASSERT(ascii);
2821 if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) {
2822 assembler->GoTo(backtrack);
2823 return;
2824 }
2825 break;
2826 case NON_LETTER_CHARACTER_MATCH:
2827 emit_function = &EmitAtomNonLetter;
2828 break;
2829 case SIMPLE_CHARACTER_MATCH:
2830 emit_function = &EmitSimpleCharacter;
2831 break;
2832 case CASE_CHARACTER_MATCH:
2833 emit_function = &EmitAtomLetter;
2834 break;
2835 default:
2836 break;
2837 }
2838 if (emit_function != NULL) {
2839 bool bound_checked = emit_function(I,
2840 compiler,
2841 quarks->At(j),
2842 backtrack,
2843 cp_offset + j,
2844 *checked_up_to < cp_offset + j,
2845 preloaded);
2846 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2847 }
2848 }
2849 } else {
2850 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2851 if (pass == CHARACTER_CLASS_MATCH) {
2852 if (first_element_checked && i == 0) continue;
2853 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2854 RegExpCharacterClass* cc = elm.char_class();
2855 EmitCharClass(assembler,
2856 cc,
2857 ascii,
2858 backtrack,
2859 cp_offset,
2860 *checked_up_to < cp_offset,
2861 preloaded,
2862 I);
2863 UpdateBoundsCheck(cp_offset, checked_up_to);
2864 }
2865 }
2866 }
803 } 2867 }
804 2868
805 2869
806 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 2870 intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
807 intptr_t eats_at_least) { 2871 intptr_t eats_at_least) {
808 UNIMPLEMENTED(); 2872 intptr_t preload_characters = Utils::Minimum(4, eats_at_least);
809 return NULL; 2873 if (compiler->macro_assembler()->CanReadUnaligned()) {
2874 bool ascii = compiler->ascii();
2875 if (ascii) {
2876 if (preload_characters > 4) preload_characters = 4;
2877 // We can't preload 3 characters because there is no machine instruction
2878 // to do that. We can't just load 4 because we could be reading
2879 // beyond the end of the string, which could cause a memory fault.
2880 if (preload_characters == 3) preload_characters = 2;
2881 } else {
2882 if (preload_characters > 2) preload_characters = 2;
2883 }
2884 } else {
2885 if (preload_characters > 1) preload_characters = 1;
2886 }
2887 return preload_characters;
810 } 2888 }
811 2889
812 2890
813 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
814 Trace* trace,
815 GuardedAlternative alternative,
816 AlternativeGeneration* alt_gen,
817 intptr_t preload_characters,
818 bool next_expects_preload) {
819 UNIMPLEMENTED();
820 }
821
822
823 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find, 2891 intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
824 intptr_t budget, 2892 intptr_t budget,
825 bool not_at_start) { 2893 bool not_at_start) {
826 intptr_t answer = Length(); 2894 intptr_t answer = Length();
827 if (answer >= still_to_find) return answer; 2895 if (answer >= still_to_find) return answer;
828 if (budget <= 0) return answer; 2896 if (budget <= 0) return answer;
829 // We are not at start after this node so we set the last argument to 'true'. 2897 // We are not at start after this node so we set the last argument to 'true'.
830 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2898 return answer + on_success()->EatsAtLeast(still_to_find - answer,
831 budget - 1, 2899 budget - 1,
832 true); 2900 true);
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
904 intptr_t budget, 2972 intptr_t budget,
905 bool not_at_start) { 2973 bool not_at_start) {
906 if (budget <= 0) return 0; 2974 if (budget <= 0) return 0;
907 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2975 // Alternative 0 is the negative lookahead, alternative 1 is what comes
908 // afterwards. 2976 // afterwards.
909 RegExpNode* node = alternatives_->At(1).node(); 2977 RegExpNode* node = alternatives_->At(1).node();
910 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2978 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
911 } 2979 }
912 2980
913 2981
2982 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
2983 static inline uint32_t SmearBitsRight(uint32_t v) {
2984 v |= v >> 1;
2985 v |= v >> 2;
2986 v |= v >> 4;
2987 v |= v >> 8;
2988 v |= v >> 16;
2989 return v;
2990 }
2991
2992
914 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2993 // Here is the meat of GetQuickCheckDetails (see also the comment on the
915 // super-class in the .h file). 2994 // super-class in the .h file).
916 // 2995 //
917 // We iterate along the text object, building up for each character a 2996 // We iterate along the text object, building up for each character a
918 // mask and value that can be used to test for a quick failure to match. 2997 // mask and value that can be used to test for a quick failure to match.
919 // The masks and values for the positions will be combined into a single 2998 // The masks and values for the positions will be combined into a single
920 // machine word for the current character width in order to be used in 2999 // machine word for the current character width in order to be used in
921 // generating a quick check. 3000 // generating a quick check.
922 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 3001 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
923 RegExpCompiler* compiler, 3002 RegExpCompiler* compiler,
924 intptr_t characters_filled_in, 3003 intptr_t characters_filled_in,
925 bool not_at_start) { 3004 bool not_at_start) {
926 UNIMPLEMENTED(); 3005 ASSERT(characters_filled_in < details->characters());
3006 intptr_t characters = details->characters();
3007 intptr_t char_mask;
3008 if (compiler->ascii()) {
3009 char_mask = Symbols::kMaxOneCharCodeSymbol;
3010 } else {
3011 char_mask = Utf16::kMaxCodeUnit;
3012 }
3013 for (intptr_t k = 0; k < elms_->length(); k++) {
3014 TextElement elm = elms_->At(k);
3015 if (elm.text_type() == TextElement::ATOM) {
3016 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
3017 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
3018 QuickCheckDetails::Position* pos =
3019 details->positions(characters_filled_in);
3020 uint16_t c = quarks->At(i);
3021 if (c > char_mask) {
3022 // If we expect a non-ASCII character from an ASCII string,
3023 // there is no way we can match. Not even case independent
3024 // matching can turn an ASCII character into non-ASCII or
3025 // vice versa.
3026 details->set_cannot_match();
3027 pos->determines_perfectly = false;
3028 return;
3029 }
3030 if (compiler->ignore_case()) {
3031 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3032 intptr_t length =
3033 GetCaseIndependentLetters(c, compiler->ascii(), chars);
3034 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
3035 if (length == 1) {
3036 // This letter has no case equivalents, so it's nice and simple
3037 // and the mask-compare will determine definitely whether we have
3038 // a match at this character position.
3039 pos->mask = char_mask;
3040 pos->value = c;
3041 pos->determines_perfectly = true;
3042 } else {
3043 uint32_t common_bits = char_mask;
3044 uint32_t bits = chars[0];
3045 for (intptr_t j = 1; j < length; j++) {
3046 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
3047 common_bits ^= differing_bits;
3048 bits &= common_bits;
3049 }
3050 // If length is 2 and common bits has only one zero in it then
3051 // our mask and compare instruction will determine definitely
3052 // whether we have a match at this character position. Otherwise
3053 // it can only be an approximate check.
3054 uint32_t one_zero = (common_bits | ~char_mask);
3055 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
3056 pos->determines_perfectly = true;
3057 }
3058 pos->mask = common_bits;
3059 pos->value = bits;
3060 }
3061 } else {
3062 // Don't ignore case. Nice simple case where the mask-compare will
3063 // determine definitely whether we have a match at this character
3064 // position.
3065 pos->mask = char_mask;
3066 pos->value = c;
3067 pos->determines_perfectly = true;
3068 }
3069 characters_filled_in++;
3070 ASSERT(characters_filled_in <= details->characters());
3071 if (characters_filled_in == details->characters()) {
3072 return;
3073 }
3074 }
3075 } else {
3076 QuickCheckDetails::Position* pos =
3077 details->positions(characters_filled_in);
3078 RegExpCharacterClass* tree = elm.char_class();
3079 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
3080 if (tree->is_negated()) {
3081 // A quick check uses multi-character mask and compare. There is no
3082 // useful way to incorporate a negative char class into this scheme
3083 // so we just conservatively create a mask and value that will always
3084 // succeed.
3085 pos->mask = 0;
3086 pos->value = 0;
3087 } else {
3088 intptr_t first_range = 0;
3089 while (ranges->At(first_range).from() > char_mask) {
3090 first_range++;
3091 if (first_range == ranges->length()) {
3092 details->set_cannot_match();
3093 pos->determines_perfectly = false;
3094 return;
3095 }
3096 }
3097 CharacterRange range = ranges->At(first_range);
3098 uint16_t from = range.from();
3099 uint16_t to = range.to();
3100 if (to > char_mask) {
3101 to = char_mask;
3102 }
3103 uint32_t differing_bits = (from ^ to);
3104 // A mask and compare is only perfect if the differing bits form a
3105 // number like 00011111 with one single block of trailing 1s.
3106 if ((differing_bits & (differing_bits + 1)) == 0 &&
3107 from + differing_bits == to) {
3108 pos->determines_perfectly = true;
3109 }
3110 uint32_t common_bits = ~SmearBitsRight(differing_bits);
3111 uint32_t bits = (from & common_bits);
3112 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
3113 CharacterRange range = ranges->At(i);
3114 uint16_t from = range.from();
3115 uint16_t to = range.to();
3116 if (from > char_mask) continue;
3117 if (to > char_mask) to = char_mask;
3118 // Here we are combining more ranges into the mask and compare
3119 // value. With each new range the mask becomes more sparse and
3120 // so the chances of a false positive rise. A character class
3121 // with multiple ranges is assumed never to be equivalent to a
3122 // mask and compare operation.
3123 pos->determines_perfectly = false;
3124 uint32_t new_common_bits = (from ^ to);
3125 new_common_bits = ~SmearBitsRight(new_common_bits);
3126 common_bits &= new_common_bits;
3127 bits &= new_common_bits;
3128 uint32_t differing_bits = (from & common_bits) ^ bits;
3129 common_bits ^= differing_bits;
3130 bits &= common_bits;
3131 }
3132 pos->mask = common_bits;
3133 pos->value = bits;
3134 }
3135 characters_filled_in++;
3136 ASSERT(characters_filled_in <= details->characters());
3137 if (characters_filled_in == details->characters()) {
3138 return;
3139 }
3140 }
3141 }
3142 ASSERT(characters_filled_in != details->characters());
3143 if (!details->cannot_match()) {
3144 on_success()-> GetQuickCheckDetails(details,
3145 compiler,
3146 characters_filled_in,
3147 true);
3148 }
927 } 3149 }
928 3150
929 3151
930 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 3152 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
931 RegExpCompiler* compiler, 3153 RegExpCompiler* compiler,
932 intptr_t characters_filled_in, 3154 intptr_t characters_filled_in,
933 bool not_at_start) { 3155 bool not_at_start) {
934 UNIMPLEMENTED(); 3156 if (body_can_be_zero_length_ || info()->visited) return;
3157 VisitMarker marker(info());
3158 return ChoiceNode::GetQuickCheckDetails(details,
3159 compiler,
3160 characters_filled_in,
3161 not_at_start);
935 } 3162 }
936 3163
937 3164
938 intptr_t TextNode::Length() { 3165 intptr_t TextNode::Length() {
939 TextElement elm = elms_->Last(); 3166 TextElement elm = elms_->Last();
940 ASSERT(elm.cp_offset() >= 0); 3167 ASSERT(elm.cp_offset() >= 0);
941 return elm.cp_offset() + elm.length(); 3168 return elm.cp_offset() + elm.length();
942 } 3169 }
943 3170
944 3171
(...skipping 518 matching lines...) Expand 10 before | Expand all | Expand 10 after
1463 Isolate* isolate) { 3690 Isolate* isolate) {
1464 uint16_t bottom = from(); 3691 uint16_t bottom = from();
1465 uint16_t top = to(); 3692 uint16_t top = to();
1466 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { 3693 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
1467 if (bottom > Symbols::kMaxOneCharCodeSymbol) return; 3694 if (bottom > Symbols::kMaxOneCharCodeSymbol) return;
1468 if (top > Symbols::kMaxOneCharCodeSymbol) { 3695 if (top > Symbols::kMaxOneCharCodeSymbol) {
1469 top = Symbols::kMaxOneCharCodeSymbol; 3696 top = Symbols::kMaxOneCharCodeSymbol;
1470 } 3697 }
1471 } 3698 }
1472 3699
1473 UNIMPLEMENTED(); 3700 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize;
3701 unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange;
3702 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3703 if (top == bottom) {
3704 // If this is a singleton we just expand the one character.
3705 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLIN T
3706 for (intptr_t i = 0; i < length; i++) {
3707 uint32_t chr = chars[i];
3708 if (chr != bottom) {
3709 ranges->Add(CharacterRange::Singleton(chars[i]));
3710 }
3711 }
3712 } else {
3713 // If this is a range we expand the characters block by block,
3714 // expanding contiguous subranges (blocks) one at a time.
3715 // The approach is as follows. For a given start character we
3716 // look up the remainder of the block that contains it (represented
3717 // by the end point), for instance we find 'z' if the character
3718 // is 'c'. A block is characterized by the property
3719 // that all characters uncanonicalize in the same way, except that
3720 // each entry in the result is incremented by the distance from the first
3721 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and // NOLINT
3722 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
3723 // Once we've found the end point we look up its uncanonicalization
3724 // and produce a range for each element. For instance for [c-f]
3725 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
3726 // add a range if it is not already contained in the input, so [c-f]
3727 // will be skipped but [C-F] will be added. If this range is not
3728 // completely contained in a block we do this for all the blocks
3729 // covered by the range (handling characters that is not in a block
3730 // as a "singleton block").
3731 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3732 intptr_t pos = bottom;
3733 while (pos <= top) {
3734 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);
3735 uint16_t block_end;
3736 if (length == 0) {
3737 block_end = pos;
3738 } else {
3739 ASSERT(length == 1);
3740 block_end = range[0];
3741 }
3742 intptr_t end = (block_end > top) ? top : block_end;
3743 length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT
3744 for (intptr_t i = 0; i < length; i++) {
3745 uint32_t c = range[i];
3746 uint16_t range_from = c - (block_end - pos);
3747 uint16_t range_to = c - (block_end - end);
3748 if (!(bottom <= range_from && range_to <= top)) {
3749 ranges->Add(CharacterRange(range_from, range_to));
3750 }
3751 }
3752 pos = end + 1;
3753 }
3754 }
3755 }
1474 3756
1475 // unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3757
1476 // if (top == bottom) { 3758 // -------------------------------------------------------------------
1477 // // If this is a singleton we just expand the one character. 3759 // Splay tree
1478 // intptr_t length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', ch ars); // NOLINT 3760
1479 // for (intptr_t i = 0; i < length; i++) { 3761
1480 // uint32_t chr = chars[i]; 3762 // Workaround for the fact that ZoneGrowableArray does not have contains().
1481 // if (chr != bottom) { 3763 static bool ArrayContains(ZoneGrowableArray<unsigned>* array,
1482 // ranges->Add(CharacterRange::Singleton(chars[i]), zone); 3764 unsigned value) {
1483 // } 3765 for (intptr_t i = 0; i < array->length(); i++) {
1484 // } 3766 if (array->At(i) == value) {
1485 // } else { 3767 return true;
1486 // // If this is a range we expand the characters block by block, 3768 }
1487 // // expanding contiguous subranges (blocks) one at a time. 3769 }
1488 // // The approach is as follows. For a given start character we 3770 return false;
1489 // // look up the remainder of the block that contains it (represented 3771 }
1490 // // by the end point), for instance we find 'z' if the character 3772
1491 // // is 'c'. A block is characterized by the property 3773
1492 // // that all characters uncanonicalize in the same way, except that 3774 void OutSet::Set(unsigned value, Isolate* isolate) {
1493 // // each entry in the result is incremented by the distance from the first 3775 if (value < kFirstLimit) {
1494 // // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] a nd // NOLINT 3776 first_ |= (1 << value);
1495 // // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 3777 } else {
1496 // // Once we've found the end point we look up its uncanonicalization 3778 if (remaining_ == NULL)
1497 // // and produce a range for each element. For instance for [c-f] 3779 remaining_ = new(isolate) ZoneGrowableArray<unsigned>(1);
1498 // // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 3780
1499 // // add a range if it is not already contained in the input, so [c-f] 3781 bool remaining_contains_value = ArrayContains(remaining_, value);
1500 // // will be skipped but [C-F] will be added. If this range is not 3782 if (remaining_->is_empty() || !remaining_contains_value) {
1501 // // completely contained in a block we do this for all the blocks 3783 remaining_->Add(value);
1502 // // covered by the range (handling characters that is not in a block 3784 }
1503 // // as a "singleton block"). 3785 }
1504 // unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3786 }
1505 // intptr_t pos = bottom; 3787
1506 // while (pos <= top) { 3788
1507 // intptr_t length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 3789 bool OutSet::Get(unsigned value) const {
1508 // uint16_t block_end; 3790 if (value < kFirstLimit) {
1509 // if (length == 0) { 3791 return (first_ & (1 << value)) != 0;
1510 // block_end = pos; 3792 } else if (remaining_ == NULL) {
1511 // } else { 3793 return false;
1512 // ASSERT(length == 1); 3794 } else {
1513 // block_end = range[0]; 3795 return ArrayContains(remaining_, value);
1514 // } 3796 }
1515 // intptr_t end = (block_end > top) ? top : block_end;
1516 // length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range) ; // NOLINT
1517 // for (intptr_t i = 0; i < length; i++) {
1518 // uint32_t c = range[i];
1519 // uint16_t range_from = c - (block_end - pos);
1520 // uint16_t range_to = c - (block_end - end);
1521 // if (!(bottom <= range_from && range_to <= top)) {
1522 // ranges->Add(CharacterRange(range_from, range_to));
1523 // }
1524 // }
1525 // pos = end + 1;
1526 // }
1527 // }
1528 } 3797 }
1529 3798
1530 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3799 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1531 RegExpCompiler* compiler, 3800 RegExpCompiler* compiler,
1532 intptr_t filled_in, 3801 intptr_t filled_in,
1533 bool not_at_start) { 3802 bool not_at_start) {
1534 if (assertion_type_ == AT_START && not_at_start) { 3803 if (assertion_type_ == AT_START && not_at_start) {
1535 details->set_cannot_match(); 3804 details->set_cannot_match();
1536 return; 3805 return;
1537 } 3806 }
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
1571 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find, 3840 intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1572 intptr_t budget, 3841 intptr_t budget,
1573 bool not_at_start) { 3842 bool not_at_start) {
1574 return EatsAtLeastHelper(still_to_find, 3843 return EatsAtLeastHelper(still_to_find,
1575 budget - 1, 3844 budget - 1,
1576 loop_node_, 3845 loop_node_,
1577 not_at_start); 3846 not_at_start);
1578 } 3847 }
1579 3848
1580 3849
1581 class RecursionCheck {
1582 public:
1583 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1584 compiler->IncrementRecursionDepth();
1585 }
1586 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1587 private:
1588 RegExpCompiler* compiler_;
1589 };
1590
1591
1592 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 3850 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1593 UNIMPLEMENTED(); 3851 UNIMPLEMENTED();
1594 return NULL; 3852 return NULL;
1595 } 3853 }
1596 3854
1597 3855
1598 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 3856 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1599 RegExpCompiler* compiler, 3857 RegExpCompiler* compiler,
1600 intptr_t characters_filled_in, 3858 intptr_t characters_filled_in,
1601 bool not_at_start) { 3859 bool not_at_start) {
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
1720 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 3978 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
1721 return RegExpEngine::CompilationResult("RegExp too big"); 3979 return RegExpEngine::CompilationResult("RegExp too big");
1722 } 3980 }
1723 3981
1724 3982
1725 // Attempts to compile the regexp using an Irregexp code generator. Returns 3983 // Attempts to compile the regexp using an Irregexp code generator. Returns
1726 // a fixed array or a null handle depending on whether it succeeded. 3984 // a fixed array or a null handle depending on whether it succeeded.
1727 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case, 3985 RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool ignore_case,
1728 bool ascii) 3986 bool ascii)
1729 : next_register_(2 * (capture_count + 1)), 3987 : next_register_(2 * (capture_count + 1)),
3988 work_list_(NULL),
1730 recursion_depth_(0), 3989 recursion_depth_(0),
1731 ignore_case_(ignore_case), 3990 ignore_case_(ignore_case),
1732 ascii_(ascii), 3991 ascii_(ascii),
1733 reg_exp_too_big_(false), 3992 reg_exp_too_big_(false),
1734 current_expansion_factor_(1), 3993 current_expansion_factor_(1),
1735 isolate_(Isolate::Current()) { 3994 isolate_(Isolate::Current()) {
1736 accept_ = new(I) EndNode(EndNode::ACCEPT, I); 3995 accept_ = new(I) EndNode(EndNode::ACCEPT, I);
1737 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 3996 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1738 } 3997 }
1739 3998
1740 3999
1741 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 4000 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1742 RegExpMacroAssembler* macro_assembler, 4001 RegExpMacroAssembler* macro_assembler,
1743 RegExpNode* start, 4002 RegExpNode* start,
1744 intptr_t capture_count, 4003 intptr_t capture_count,
1745 const String& pattern) { 4004 const String& pattern) {
1746 UNIMPLEMENTED(); 4005 static const bool use_slow_safe_regexp_compiler = false;
1747 return RegExpEngine::CompilationResult(""); 4006
4007 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
4008 macro_assembler_ = macro_assembler;
4009
4010 ZoneGrowableArray<RegExpNode*> work_list(0);
4011 work_list_ = &work_list;
4012 BlockLabel fail;
4013 macro_assembler_->PushBacktrack(&fail);
4014 Trace new_trace;
4015 start->Emit(this, &new_trace);
4016 macro_assembler_->BindBlock(&fail);
4017 macro_assembler_->Fail();
4018 while (!work_list.is_empty()) {
4019 work_list.RemoveLast()->Emit(this, &new_trace);
4020 }
4021 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
4022
4023 Function& function = macro_assembler->GetCode(pattern);
4024
4025 if (FLAG_trace_irregexp) {
4026 macro_assembler->PrintBlocks();
4027 }
4028
4029 return RegExpEngine::CompilationResult(function);
1748 } 4030 }
1749 4031
1750 4032
1751 RegExpEngine::CompilationResult RegExpEngine::Compile( 4033 RegExpEngine::CompilationResult RegExpEngine::Compile(
1752 RegExpCompileData* data, 4034 RegExpCompileData* data,
1753 bool ignore_case, 4035 bool ignore_case,
1754 bool is_global, 4036 bool is_global,
1755 bool is_multiline, 4037 bool is_multiline,
1756 const String& pattern, 4038 const String& pattern,
1757 const String& sample_subject, 4039 const String& sample_subject,
1758 bool is_ascii) { 4040 bool is_ascii) {
1759 Isolate* isolate = Isolate::Current(); 4041 Isolate* isolate = Isolate::Current();
1760 4042
1761 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 4043 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
1762 return IrregexpRegExpTooBig(); 4044 return IrregexpRegExpTooBig();
1763 } 4045 }
1764 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); 4046 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
1765 4047
1766 // TODO(jgruber): Character frequency sampling. 4048 // TODO(jgruber): Character frequency sampling.
1767 4049
1768 // Wrap the body of the regexp in capture #0. 4050 // Wrap the body of the regexp in capture #0.
1769 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 4051 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
1770 0, 4052 0,
1771 &compiler, 4053 &compiler,
1772 compiler.accept()); 4054 compiler.accept());
1773 4055
1774 RegExpNode* node = captured_body; 4056 RegExpNode* node = captured_body;
1775 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 4057 bool is_start_anchored = data->tree->IsAnchoredAtStart();
4058 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
4059 intptr_t max_length = data->tree->max_match();
1776 if (!is_start_anchored) { 4060 if (!is_start_anchored) {
1777 // Add a .*? at the beginning, outside the body capture, unless 4061 // Add a .*? at the beginning, outside the body capture, unless
1778 // this expression is anchored at the beginning. 4062 // this expression is anchored at the beginning.
1779 RegExpNode* loop_node = 4063 RegExpNode* loop_node =
1780 RegExpQuantifier::ToNode(0, 4064 RegExpQuantifier::ToNode(0,
1781 RegExpTree::kInfinity, 4065 RegExpTree::kInfinity,
1782 false, 4066 false,
1783 new(isolate) RegExpCharacterClass('*'), 4067 new(isolate) RegExpCharacterClass('*'),
1784 &compiler, 4068 &compiler,
1785 captured_body, 4069 captured_body,
(...skipping 23 matching lines...) Expand all
1809 4093
1810 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate); 4094 if (node == NULL) node = new(isolate) EndNode(EndNode::BACKTRACK, isolate);
1811 data->node = node; 4095 data->node = node;
1812 Analysis analysis(ignore_case, is_ascii); 4096 Analysis analysis(ignore_case, is_ascii);
1813 analysis.EnsureAnalyzed(node); 4097 analysis.EnsureAnalyzed(node);
1814 if (analysis.has_failed()) { 4098 if (analysis.has_failed()) {
1815 const char* error_message = analysis.error_message(); 4099 const char* error_message = analysis.error_message();
1816 return CompilationResult(error_message); 4100 return CompilationResult(error_message);
1817 } 4101 }
1818 4102
1819 // TODO(jgruber): Assemble native code. 4103 // Native regexp implementation.
1820 4104
1821 return RegExpEngine::CompilationResult(""); 4105 IRRegExpMacroAssembler::Mode mode =
1822 } 4106 is_ascii ? IRRegExpMacroAssembler::ASCII
1823 4107 : IRRegExpMacroAssembler::UC16;
1824 4108
4109 IRRegExpMacroAssembler macro_assembler(mode, data->capture_count, isolate);
4110
4111 // Inserted here, instead of in Assembler, because it depends on information
4112 // in the AST that isn't replicated in the Node structure.
4113 static const intptr_t kMaxBacksearchLimit = 1024;
4114 if (is_end_anchored &&
4115 !is_start_anchored &&
4116 max_length < kMaxBacksearchLimit) {
4117 macro_assembler.SetCurrentPositionFromEnd(max_length);
4118 }
4119
4120 if (is_global) {
4121 macro_assembler.set_global_mode(
4122 (data->tree->min_match() > 0)
4123 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
4124 : RegExpMacroAssembler::GLOBAL);
4125 }
4126
4127 return compiler.Assemble(&macro_assembler,
4128 node,
4129 data->capture_count,
4130 pattern);
4131 }
4132
4133 RawJSRegExp* RegExpEngine::New(const String& pattern,
4134 bool multi_line,
4135 bool ignore_case) {
4136 JSRegExp& regexp = JSRegExp::Handle(JSRegExp::New(0));
4137
4138 regexp.set_pattern(pattern);
4139
4140 if (multi_line) regexp.set_is_multi_line();
4141 if (ignore_case) regexp.set_is_ignore_case();
4142
4143 // TODO(jgruber): We might want to use normal string searching algorithms
4144 // for simple patterns.
4145 regexp.set_is_complex();
4146 regexp.set_is_global(); // All dart regexps are global.
4147
4148 regexp.set_one_byte_function(Function::Handle());
4149 regexp.set_two_byte_function(Function::Handle());
4150
4151 return regexp.raw();
4152 }
4153
4154
4155 void BoyerMoorePositionInfo::Set(intptr_t character) {
4156 SetInterval(Interval(character, character));
4157 }
4158
4159
4160 ContainedInLattice AddRange(ContainedInLattice containment,
4161 const int* ranges,
4162 intptr_t ranges_length,
4163 Interval new_range) {
4164 ASSERT((ranges_length & 1) == 1);
4165 ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1);
4166 if (containment == kLatticeUnknown) return containment;
4167 bool inside = false;
4168 intptr_t last = 0;
4169 for (intptr_t i = 0; i < ranges_length;
4170 inside = !inside, last = ranges[i], i++) {
4171 // Consider the range from last to ranges[i].
4172 // We haven't got to the new range yet.
4173 if (ranges[i] <= new_range.from()) continue;
4174 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
4175 // inclusive, but the values in ranges are not.
4176 if (last <= new_range.from() && new_range.to() < ranges[i]) {
4177 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
4178 }
4179 return kLatticeUnknown;
4180 }
4181 return containment;
4182 }
4183
4184
4185 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
4186 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
4187 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
4188 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
4189 surrogate_ =
4190 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
4191 if (interval.to() - interval.from() >= kMapSize - 1) {
4192 if (map_count_ != kMapSize) {
4193 map_count_ = kMapSize;
4194 for (intptr_t i = 0; i < kMapSize; i++) map_->At(i) = true;
4195 }
4196 return;
4197 }
4198 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
4199 intptr_t mod_character = (i & kMask);
4200 if (!map_->At(mod_character)) {
4201 map_count_++;
4202 map_->At(mod_character) = true;
4203 }
4204 if (map_count_ == kMapSize) return;
4205 }
4206 }
4207
4208
4209 void BoyerMoorePositionInfo::SetAll() {
4210 s_ = w_ = d_ = kLatticeUnknown;
4211 if (map_count_ != kMapSize) {
4212 map_count_ = kMapSize;
4213 for (intptr_t i = 0; i < kMapSize; i++) map_->At(i) = true;
4214 }
4215 }
4216
4217
4218 BoyerMooreLookahead::BoyerMooreLookahead(
4219 intptr_t length, RegExpCompiler* compiler, Isolate* isolate)
4220 : length_(length),
4221 compiler_(compiler) {
4222 if (compiler->ascii()) {
4223 max_char_ = Symbols::kMaxOneCharCodeSymbol;
4224 } else {
4225 max_char_ = Utf16::kMaxCodeUnit;
4226 }
4227 bitmaps_ = new(isolate) ZoneGrowableArray<BoyerMoorePositionInfo*>(length);
4228 for (intptr_t i = 0; i < length; i++) {
4229 bitmaps_->Add(new(isolate) BoyerMoorePositionInfo(isolate));
4230 }
4231 }
4232
4233
4234 // Take all the characters that will not prevent a successful match if they
4235 // occur in the subject string in the range between min_lookahead and
4236 // max_lookahead (inclusive) measured from the current position. If the
4237 // character at max_lookahead offset is not one of these characters, then we
4238 // can safely skip forwards by the number of characters in the range.
4239 intptr_t BoyerMooreLookahead::GetSkipTable(intptr_t min_lookahead,
4240 intptr_t max_lookahead,
4241 TypedData* boolean_skip_table) {
4242 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
4243
4244 const intptr_t kSkipArrayEntry = 0;
4245 const intptr_t kDontSkipArrayEntry = 1;
4246
4247 for (intptr_t i = 0; i < kSize; i++) {
4248 boolean_skip_table->SetUint8(i, kSkipArrayEntry);
4249 }
4250 intptr_t skip = max_lookahead + 1 - min_lookahead;
4251
4252 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
4253 BoyerMoorePositionInfo* map = bitmaps_->At(i);
4254 for (intptr_t j = 0; j < kSize; j++) {
4255 if (map->at(j)) {
4256 boolean_skip_table->SetUint8(j, kDontSkipArrayEntry);
4257 }
4258 }
4259 }
4260
4261 return skip;
4262 }
4263
4264
4265 // Find the longest range of lookahead that has the fewest number of different
4266 // characters that can occur at a given position. Since we are optimizing two
4267 // different parameters at once this is a tradeoff.
4268 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
4269 intptr_t biggest_points = 0;
4270 // If more than 32 characters out of 128 can occur it is unlikely that we can
4271 // be lucky enough to step forwards much of the time.
4272 const intptr_t kMaxMax = 32;
4273 for (intptr_t max_number_of_chars = 4;
4274 max_number_of_chars < kMaxMax;
4275 max_number_of_chars *= 2) {
4276 biggest_points =
4277 FindBestInterval(max_number_of_chars, biggest_points, from, to);
4278 }
4279 if (biggest_points == 0) return false;
4280 return true;
4281 }
4282
4283
4284 // Find the highest-points range between 0 and length_ where the character
4285 // information is not too vague. 'Too vague' means that there are more than
4286 // max_number_of_chars that can occur at this position. Calculates the number
4287 // of points as the product of width-of-the-range and
4288 // probability-of-finding-one-of-the-characters, where the probability is
4289 // calculated using the frequency distribution of the sample subject string.
4290 intptr_t BoyerMooreLookahead::FindBestInterval(
4291 intptr_t max_number_of_chars,
4292 intptr_t old_biggest_points,
4293 int* from,
4294 int* to) {
4295 intptr_t biggest_points = old_biggest_points;
4296 static const intptr_t kSize = RegExpMacroAssembler::kTableSize;
4297 for (intptr_t i = 0; i < length_; ) {
4298 while (i < length_ && Count(i) > max_number_of_chars) i++;
4299 if (i == length_) break;
4300 intptr_t remembered_from = i;
4301 bool union_map[kSize];
4302 for (intptr_t j = 0; j < kSize; j++) union_map[j] = false;
4303 while (i < length_ && Count(i) <= max_number_of_chars) {
4304 BoyerMoorePositionInfo* map = bitmaps_->At(i);
4305 for (intptr_t j = 0; j < kSize; j++) union_map[j] |= map->at(j);
4306 i++;
4307 }
4308 intptr_t frequency = 0;
4309 for (intptr_t j = 0; j < kSize; j++) {
4310 if (union_map[j]) {
4311 // Add 1 to the frequency to give a small per-character boost for
4312 // the cases where our sampling is not good enough and many
4313 // characters have a frequency of zero. This means the frequency
4314 // can theoretically be up to 2*kSize though we treat it mostly as
4315 // a fraction of kSize.
4316
4317 // TODO(jgruber): Use frequency sampling once implemented.
4318 }
4319 }
4320 // We use the probability of skipping times the distance we are skipping to
4321 // judge the effectiveness of this. Actually we have a cut-off: By
4322 // dividing by 2 we switch off the skipping if the probability of skipping
4323 // is less than 50%. This is because the multibyte mask-and-compare
4324 // skipping in quickcheck is more likely to do well on this case.
4325 bool in_quickcheck_range = ((i - remembered_from < 4) ||
4326 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
4327 // Called 'probability' but it is only a rough estimate and can actually
4328 // be outside the 0-kSize range.
4329 intptr_t probability =
4330 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
4331 intptr_t points = (i - remembered_from) * probability;
4332 if (points > biggest_points) {
4333 *from = remembered_from;
4334 *to = i - 1;
4335 biggest_points = points;
4336 }
4337 }
4338 return biggest_points;
4339 }
4340
4341
4342 // See comment above on the implementation of GetSkipTable.
4343 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
4344 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
4345
4346 intptr_t min_lookahead = 0;
4347 intptr_t max_lookahead = 0;
4348
4349 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
4350
4351 bool found_single_character = false;
4352 intptr_t single_character = 0;
4353 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
4354 BoyerMoorePositionInfo* map = bitmaps_->At(i);
4355 if (map->map_count() > 1 ||
4356 (found_single_character && map->map_count() != 0)) {
4357 found_single_character = false;
4358 break;
4359 }
4360 for (intptr_t j = 0; j < kSize; j++) {
4361 if (map->at(j)) {
4362 found_single_character = true;
4363 single_character = j;
4364 break;
4365 }
4366 }
4367 }
4368
4369 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
4370
4371 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
4372 // The mask-compare can probably handle this better.
4373 return false;
4374 }
4375
4376 if (found_single_character) {
4377 BlockLabel cont, again;
4378 masm->BindBlock(&again);
4379 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
4380 if (max_char_ > kSize) {
4381 masm->CheckCharacterAfterAnd(single_character,
4382 RegExpMacroAssembler::kTableMask,
4383 &cont);
4384 } else {
4385 masm->CheckCharacter(single_character, &cont);
4386 }
4387 masm->AdvanceCurrentPosition(lookahead_width);
4388 masm->GoTo(&again);
4389 masm->BindBlock(&cont);
4390 return true;
4391 }
4392
4393 TypedData& boolean_skip_table = TypedData::ZoneHandle(
4394 compiler_->isolate(),
4395 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
4396 intptr_t skip_distance = GetSkipTable(
4397 min_lookahead, max_lookahead, &boolean_skip_table);
4398 ASSERT(skip_distance != 0);
4399
4400 BlockLabel cont, again;
4401
4402 masm->BindBlock(&again);
4403 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
4404 masm->CheckBitInTable(boolean_skip_table, &cont);
4405 masm->AdvanceCurrentPosition(skip_distance);
4406 masm->Jump(&again);
4407 masm->BindBlock(&cont);
4408
4409 return true;
4410 }
4411
4412
1825 // ------------------------------------------------------------------- 4413 // -------------------------------------------------------------------
1826 // Analysis 4414 // Analysis
1827 4415
1828 4416
1829 void Analysis::EnsureAnalyzed(RegExpNode* that) { 4417 void Analysis::EnsureAnalyzed(RegExpNode* that) {
1830 // TODO(jgruber): Stack overflow check. 4418 // TODO(jgruber): Stack overflow check.
1831 if (that->info()->been_analyzed || that->info()->being_analyzed) 4419 if (that->info()->been_analyzed || that->info()->being_analyzed)
1832 return; 4420 return;
1833 that->info()->being_analyzed = true; 4421 that->info()->being_analyzed = true;
1834 that->Accept(this); 4422 that->Accept(this);
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
2004 4592
2005 4593
2006 void DotPrinter::PrintAttributes(RegExpNode* that) { 4594 void DotPrinter::PrintAttributes(RegExpNode* that) {
2007 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 4595 OS::Print(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
2008 "margin=0.1, fontsize=10, label=\"{", that); 4596 "margin=0.1, fontsize=10, label=\"{", that);
2009 AttributePrinter printer; 4597 AttributePrinter printer;
2010 NodeInfo* info = that->info(); 4598 NodeInfo* info = that->info();
2011 printer.PrintBit("NI", info->follows_newline_interest); 4599 printer.PrintBit("NI", info->follows_newline_interest);
2012 printer.PrintBit("WI", info->follows_word_interest); 4600 printer.PrintBit("WI", info->follows_word_interest);
2013 printer.PrintBit("SI", info->follows_start_interest); 4601 printer.PrintBit("SI", info->follows_start_interest);
2014 Label* label = that->label(); 4602 BlockLabel* label = that->label();
2015 if (label->IsBound()) 4603 if (label->IsBound())
2016 printer.PrintPositive("@", label->Position()); 4604 printer.PrintPositive("@", label->Position());
2017 OS::Print("}\"];\n" 4605 OS::Print("}\"];\n"
2018 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n", 4606 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
2019 that, that); 4607 that, that);
2020 } 4608 }
2021 4609
2022 4610
2023 static const bool kPrintDispatchTable = false; 4611 static const bool kPrintDispatchTable = false;
2024 void DotPrinter::VisitChoice(ChoiceNode* that) { 4612 void DotPrinter::VisitChoice(ChoiceNode* that) {
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
2166 RegExpNode* node, 4754 RegExpNode* node,
2167 bool ignore_case) { 4755 bool ignore_case) {
2168 DotPrinter printer(ignore_case); 4756 DotPrinter printer(ignore_case);
2169 printer.PrintNode(label, node); 4757 printer.PrintNode(label, node);
2170 } 4758 }
2171 4759
2172 4760
2173 #endif // DEBUG 4761 #endif // DEBUG
2174 4762
2175 } // namespace dart 4763 } // namespace dart
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698