Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 939) |
+++ src/jsregexp.cc (working copy) |
@@ -962,10 +962,12 @@ |
} |
Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp); |
- Handle<String> two_byte_subject = CachedStringToTwoByte(subject); |
+ if (!subject->IsFlat(StringShape(*subject))) { |
+ FlattenString(subject); |
Lasse Reichstein
2008/12/09 07:43:07
The string is also flattened in the IA32 branch (l…
|
+ } |
rc = IrregexpInterpreter::Match(byte_codes, |
- two_byte_subject, |
+ subject, |
offsets_vector, |
previous_index); |
break; |
@@ -1191,7 +1193,7 @@ |
class RegExpCompiler { |
public: |
- RegExpCompiler(int capture_count, bool ignore_case); |
+ RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
int AllocateRegister() { return next_register_++; } |
@@ -1215,6 +1217,7 @@ |
inline void DecrementRecursionDepth() { recursion_depth_--; } |
inline bool ignore_case() { return ignore_case_; } |
+ inline bool ascii() { return ascii_; } |
private: |
EndNode* accept_; |
@@ -1223,6 +1226,7 @@ |
int recursion_depth_; |
RegExpMacroAssembler* macro_assembler_; |
bool ignore_case_; |
+ bool ascii_; |
}; |
@@ -1239,11 +1243,12 @@ |
// Attempts to compile the regexp using an Irregexp code generator. Returns |
// a fixed array or a null handle depending on whether it succeeded. |
-RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) |
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) |
: next_register_(2 * (capture_count + 1)), |
work_list_(NULL), |
recursion_depth_(0), |
- ignore_case_(ignore_case) { |
+ ignore_case_(ignore_case), |
+ ascii_(ascii) { |
accept_ = new EndNode(EndNode::ACCEPT); |
} |
@@ -1682,7 +1687,6 @@ |
chars[0], |
chars[1], |
on_failure)) { |
- ok.Unuse(); |
} else { |
macro_assembler->CheckCharacter(chars[0], &ok); |
macro_assembler->CheckNotCharacter(chars[1], on_failure); |
@@ -1711,8 +1715,10 @@ |
RegExpCharacterClass* cc, |
int cp_offset, |
Label* on_failure, |
- bool check_offset) { |
+ bool check_offset, |
+ bool ascii) { |
ZoneList<CharacterRange>* ranges = cc->ranges(); |
+ const int max_char = ascii ? 0x7f : 0xffff; |
Lasse Reichstein
2008/12/09 07:43:07
Use String::kMaxAsciiCharCode instead of 0x7f?
|
Label success; |
@@ -1721,16 +1727,27 @@ |
int range_count = ranges->length(); |
- if (range_count == 0) { |
+ int last_valid_range = range_count - 1; |
+ while (last_valid_range >= 0) { |
+ CharacterRange& range = ranges->at(last_valid_range); |
+ if (range.from() <= max_char) { |
+ break; |
+ } |
+ last_valid_range--; |
+ } |
+ |
+ if (last_valid_range < 0) { |
if (!cc->is_negated()) { |
+ // TODO(plesner): We can remove this when the node level does our |
+ // ASCII optimizations for us. |
macro_assembler->GoTo(on_failure); |
} |
return; |
} |
- if (range_count == 1 && |
+ if (last_valid_range == 0 && |
!cc->is_negated() && |
- ranges->at(0).IsEverything(0xffff)) { |
+ ranges->at(0).IsEverything(max_char)) { |
// This is a common case hit by non-anchored expressions. |
// TODO(erikcorry): We should have a macro assembler instruction that just |
// checks for end of string without loading the character. |
@@ -1748,18 +1765,22 @@ |
macro_assembler->LoadCurrentCharacterUnchecked(cp_offset); |
} |
- for (int i = 0; i < range_count - 1; i++) { |
+ for (int i = 0; i <= last_valid_range; i++) { |
CharacterRange& range = ranges->at(i); |
Label next_range; |
uc16 from = range.from(); |
uc16 to = range.to(); |
+ if (from > max_char) { |
+ continue; |
+ } |
+ if (to > max_char) to = max_char; |
if (to == from) { |
macro_assembler->CheckCharacter(to, char_is_in_class); |
} else { |
if (from != 0) { |
macro_assembler->CheckCharacterLT(from, &next_range); |
Lasse Reichstein
2008/12/09 07:43:07
How about a CheckCharacterRange(from, to, char_is_in_class…
|
} |
- if (to != 0xffff) { |
+ if (to != max_char) { |
macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); |
} else { |
macro_assembler->GoTo(char_is_in_class); |
@@ -1768,10 +1789,13 @@ |
macro_assembler->Bind(&next_range); |
} |
- CharacterRange& range = ranges->at(range_count - 1); |
+ CharacterRange& range = ranges->at(last_valid_range); |
uc16 from = range.from(); |
uc16 to = range.to(); |
+ if (to > max_char) to = max_char; |
+ ASSERT(to >= from); |
+ |
if (to == from) { |
if (cc->is_negated()) { |
macro_assembler->CheckCharacter(to, on_failure); |
@@ -1875,7 +1899,25 @@ |
macro_assembler->GoTo(backtrack); |
return true; |
} |
- // First, handle straight character matches. |
+ // First check for non-ASCII text. |
+ // TODO(plesner): We should do this at node level. |
+ if (compiler->ascii()) { |
+ for (int i = element_count - 1; i >= 0; i--) { |
+ TextElement elm = elms_->at(i); |
+ if (elm.type == TextElement::ATOM) { |
+ Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ for (int j = quarks.length() - 1; j >= 0; j--) { |
+ if (quarks[j] > 0x7f) { |
Lasse Reichstein
2008/12/09 07:43:07
Use String::kMaxAsciiCharCode instead of 0x7f.
|
+ macro_assembler->GoTo(backtrack); |
+ return true; |
+ } |
+ } |
+ } else { |
+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
+ } |
+ } |
+ } |
+ // Second, handle straight character matches. |
int checked_up_to = -1; |
for (int i = element_count - 1; i >= 0; i--) { |
TextElement elm = elms_->at(i); |
@@ -1902,7 +1944,7 @@ |
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
} |
} |
- // Second, handle case independent letter matches if any. |
+ // Third, handle case independent letter matches if any. |
if (compiler->ignore_case()) { |
for (int i = element_count - 1; i >= 0; i--) { |
TextElement elm = elms_->at(i); |
@@ -1930,7 +1972,8 @@ |
cc, |
cp_offset, |
backtrack, |
- checked_up_to < cp_offset); |
+ checked_up_to < cp_offset, |
+ compiler->ascii()); |
if (cp_offset > checked_up_to) checked_up_to = cp_offset; |
} |
} |
@@ -3611,7 +3654,7 @@ |
bool is_multiline, |
Handle<String> pattern, |
bool is_ascii) { |
- RegExpCompiler compiler(input->capture_count, ignore_case); |
+ RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii); |
// Wrap the body of the regexp in capture #0. |
RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, |
0, |