Chromium Code Reviews
| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 939) |
| +++ src/jsregexp.cc (working copy) |
| @@ -962,10 +962,12 @@ |
| } |
| Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp); |
| - Handle<String> two_byte_subject = CachedStringToTwoByte(subject); |
| + if (!subject->IsFlat(StringShape(*subject))) { |
| + FlattenString(subject); |
|
Lasse Reichstein
2008/12/09 07:43:07
The string is also flattened in the IA32 branch (l… [comment truncated in the review listing]
|
| + } |
| rc = IrregexpInterpreter::Match(byte_codes, |
| - two_byte_subject, |
| + subject, |
| offsets_vector, |
| previous_index); |
| break; |
| @@ -1191,7 +1193,7 @@ |
| class RegExpCompiler { |
| public: |
| - RegExpCompiler(int capture_count, bool ignore_case); |
| + RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
| int AllocateRegister() { return next_register_++; } |
| @@ -1215,6 +1217,7 @@ |
| inline void DecrementRecursionDepth() { recursion_depth_--; } |
| inline bool ignore_case() { return ignore_case_; } |
| + inline bool ascii() { return ascii_; } |
| private: |
| EndNode* accept_; |
| @@ -1223,6 +1226,7 @@ |
| int recursion_depth_; |
| RegExpMacroAssembler* macro_assembler_; |
| bool ignore_case_; |
| + bool ascii_; |
| }; |
| @@ -1239,11 +1243,12 @@ |
| // Attempts to compile the regexp using an Irregexp code generator. Returns |
| // a fixed array or a null handle depending on whether it succeeded. |
| -RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) |
| +RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) |
| : next_register_(2 * (capture_count + 1)), |
| work_list_(NULL), |
| recursion_depth_(0), |
| - ignore_case_(ignore_case) { |
| + ignore_case_(ignore_case), |
| + ascii_(ascii) { |
| accept_ = new EndNode(EndNode::ACCEPT); |
| } |
| @@ -1682,7 +1687,6 @@ |
| chars[0], |
| chars[1], |
| on_failure)) { |
| - ok.Unuse(); |
| } else { |
| macro_assembler->CheckCharacter(chars[0], &ok); |
| macro_assembler->CheckNotCharacter(chars[1], on_failure); |
| @@ -1711,8 +1715,10 @@ |
| RegExpCharacterClass* cc, |
| int cp_offset, |
| Label* on_failure, |
| - bool check_offset) { |
| + bool check_offset, |
| + bool ascii) { |
| ZoneList<CharacterRange>* ranges = cc->ranges(); |
| + const int max_char = ascii ? 0x7f : 0xffff; |
|
Lasse Reichstein
2008/12/09 07:43:07
Use String::kMaxAsciiCharCode instead of 0x7f?
|
| Label success; |
| @@ -1721,16 +1727,27 @@ |
| int range_count = ranges->length(); |
| - if (range_count == 0) { |
| + int last_valid_range = range_count - 1; |
| + while (last_valid_range >= 0) { |
| + CharacterRange& range = ranges->at(last_valid_range); |
| + if (range.from() <= max_char) { |
| + break; |
| + } |
| + last_valid_range--; |
| + } |
| + |
| + if (last_valid_range < 0) { |
| if (!cc->is_negated()) { |
| + // TODO(plesner): We can remove this when the node level does our |
| + // ASCII optimizations for us. |
| macro_assembler->GoTo(on_failure); |
| } |
| return; |
| } |
| - if (range_count == 1 && |
| + if (last_valid_range == 0 && |
| !cc->is_negated() && |
| - ranges->at(0).IsEverything(0xffff)) { |
| + ranges->at(0).IsEverything(max_char)) { |
| // This is a common case hit by non-anchored expressions. |
| // TODO(erikcorry): We should have a macro assembler instruction that just |
| // checks for end of string without loading the character. |
| @@ -1748,18 +1765,22 @@ |
| macro_assembler->LoadCurrentCharacterUnchecked(cp_offset); |
| } |
| - for (int i = 0; i < range_count - 1; i++) { |
| + for (int i = 0; i <= last_valid_range; i++) { |
| CharacterRange& range = ranges->at(i); |
| Label next_range; |
| uc16 from = range.from(); |
| uc16 to = range.to(); |
| + if (from > max_char) { |
| + continue; |
| + } |
| + if (to > max_char) to = max_char; |
| if (to == from) { |
| macro_assembler->CheckCharacter(to, char_is_in_class); |
| } else { |
| if (from != 0) { |
| macro_assembler->CheckCharacterLT(from, &next_range); |
|
Lasse Reichstein
2008/12/09 07:43:07
How about a CheckCharacterRange(from, to, char_is_… [comment truncated in the review listing]
|
| } |
| - if (to != 0xffff) { |
| + if (to != max_char) { |
| macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); |
| } else { |
| macro_assembler->GoTo(char_is_in_class); |
| @@ -1768,10 +1789,13 @@ |
| macro_assembler->Bind(&next_range); |
| } |
| - CharacterRange& range = ranges->at(range_count - 1); |
| + CharacterRange& range = ranges->at(last_valid_range); |
| uc16 from = range.from(); |
| uc16 to = range.to(); |
| + if (to > max_char) to = max_char; |
| + ASSERT(to >= from); |
| + |
| if (to == from) { |
| if (cc->is_negated()) { |
| macro_assembler->CheckCharacter(to, on_failure); |
| @@ -1875,7 +1899,25 @@ |
| macro_assembler->GoTo(backtrack); |
| return true; |
| } |
| - // First, handle straight character matches. |
| + // First check for non-ASCII text. |
| + // TODO(plesner): We should do this at node level. |
| + if (compiler->ascii()) { |
| + for (int i = element_count - 1; i >= 0; i--) { |
| + TextElement elm = elms_->at(i); |
| + if (elm.type == TextElement::ATOM) { |
| + Vector<const uc16> quarks = elm.data.u_atom->data(); |
| + for (int j = quarks.length() - 1; j >= 0; j--) { |
| + if (quarks[j] > 0x7f) { |
|
Lasse Reichstein
2008/12/09 07:43:07
Use String::kMaxAsciiCharCode instead of 0x7f.
|
| + macro_assembler->GoTo(backtrack); |
| + return true; |
| + } |
| + } |
| + } else { |
| + ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| + } |
| + } |
| + } |
| + // Second, handle straight character matches. |
| int checked_up_to = -1; |
| for (int i = element_count - 1; i >= 0; i--) { |
| TextElement elm = elms_->at(i); |
| @@ -1902,7 +1944,7 @@ |
| ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| } |
| } |
| - // Second, handle case independent letter matches if any. |
| + // Third, handle case independent letter matches if any. |
| if (compiler->ignore_case()) { |
| for (int i = element_count - 1; i >= 0; i--) { |
| TextElement elm = elms_->at(i); |
| @@ -1930,7 +1972,8 @@ |
| cc, |
| cp_offset, |
| backtrack, |
| - checked_up_to < cp_offset); |
| + checked_up_to < cp_offset, |
| + compiler->ascii()); |
| if (cp_offset > checked_up_to) checked_up_to = cp_offset; |
| } |
| } |
| @@ -3611,7 +3654,7 @@ |
| bool is_multiline, |
| Handle<String> pattern, |
| bool is_ascii) { |
| - RegExpCompiler compiler(input->capture_count, ignore_case); |
| + RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii); |
| // Wrap the body of the regexp in capture #0. |
| RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, |
| 0, |