Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 11134) |
+++ src/jsregexp.cc (working copy) |
@@ -1534,6 +1534,276 @@ |
} |
+static void EmitBoundaryTest(RegExpMacroAssembler* masm, |
+ int border, |
+ Label* fall_through, |
+ Label* above_or_equal, |
+ Label* below) { |
+ if (below != fall_through) { |
+ masm->CheckCharacterLT(border, below); |
+ if (above_or_equal != fall_through) masm->GoTo(above_or_equal); |
+ } else { |
+ masm->CheckCharacterGT(border - 1, above_or_equal); |
+ } |
+} |
+ |
+ |
+static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, |
+ int first, |
+ int last, |
+ Label* fall_through, |
+ Label* even_label, |
+ Label* odd_label) { |
+ if (even_label == fall_through) { |
+ if (first == last) { |
+ masm->CheckNotCharacter(first, odd_label); |
+ } else { |
+ masm->CheckCharacterNotInRange(first, last, odd_label); |
+ } |
+ } else { |
+ if (first == last) { |
+ masm->CheckCharacter(first, even_label); |
+ } else { |
+ masm->CheckCharacterInRange(first, last, even_label); |
+ } |
+ if (odd_label != fall_through) masm->GoTo(odd_label); |
+ } |
+} |
+ |
+ |
+static void EmitUseLookupTable( |
+ RegExpMacroAssembler* masm, |
+ ZoneList<int>* ranges2, |
+ int start_index, |
+ int end_index, |
+ int min_char, |
+ Label* fall_through, |
+ Label* even_label, |
+ Label* odd_label) { |
+ static const int kSize = RegExpMacroAssembler::kTableSize; |
+ static const int kMask = RegExpMacroAssembler::kTableMask; |
+ |
+ char templ[kSize]; |
+ Label* on_bit_set; |
+ Label* on_bit_clear; |
+ int bit; |
+ if (even_label == fall_through) { |
+ on_bit_set = odd_label; |
+ on_bit_clear = even_label; |
+ bit = 1; |
+ } else { |
+ on_bit_set = even_label; |
+ on_bit_clear = odd_label; |
+ bit = 0; |
+ } |
+ int base = (min_char & ~kMask); |
+ for (int i = 0; i < (ranges2->at(start_index) & kMask) && i < kSize; i++) { |
+ templ[i] = bit; |
+ } |
+ int j = 0; |
+ bit ^= 1; |
+ for (int i = start_index; i < end_index; i++) { |
+ for (j = (ranges2->at(i) & kMask); |
+ j < ranges2->at(i + 1) - base && j < kSize; |
+ j++) { |
+ templ[j] = bit; |
+ } |
+ bit ^= 1; |
+ } |
+ for (int i = j; i < kSize; i++) { |
+ templ[i] = bit; |
+ } |
+ // TODO(erikcorry): Cache these. |
+ Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED); |
+ for (int i = 0; i < kSize; i++) { |
+ ba->set(i, templ[i]); |
+ } |
+ masm->CheckBitInTable(ba, on_bit_set); |
+ if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); |
+} |
+ |
+ |
+// Gets a series of segments representing a character class. If the character |
+// is in the range between even and an odd boundary (counting from start_index) |
+// then go to even_label, otherwise go to odd_label. We already know that |
+// the character is in the range of min_char to max_char inclusive. Either |
+// label can be NULL indicating backtracking. Either label can also be equal to |
+// the fall_through label. |
+static void GenerateBranches(RegExpMacroAssembler* masm, |
+ ZoneList<int>* ranges2, |
+ int start_index, |
+ int end_index, |
+ uc16 min_char, |
+ uc16 max_char, |
+ Label* fall_through, |
+ Label* even_label, |
+ Label* odd_label) { |
+ int first = ranges2->at(start_index); |
+ int last = ranges2->at(end_index) - 1; |
+ |
+ // Just need to test if the character is before or on-or-after |
+ // a particular character. |
+ if (start_index == end_index) { |
+ EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); |
+ return; |
+ } |
+ |
+ // Another almost trivial case: There is one interval in the middle that is |
+ // different from the end intervals. |
+ if (start_index + 1 == end_index) { |
+ EmitDoubleBoundaryTest( |
+ masm, first, last, fall_through, even_label, odd_label); |
+ return; |
+ } |
+ |
+ // It's not worth using table lookup if there are very few intervals in the |
+ // character class. |
+ if (end_index - start_index <= 6) { |
+ typedef enum { kSingleCharacter, kRange, kDone } SearchOrder; |
+ // It is faster to test for individual characters, so we look for those |
+ // first, then try arbitrary ranges in the second round. |
+ for (SearchOrder step = kSingleCharacter; |
+ step < kDone; |
+ step = (step == kSingleCharacter) ? kRange : kDone) { |
+ for (int i = start_index; i < end_index; i++) { |
+ int c = ranges2->at(i); |
+ int next = ranges2->at(i + 1) - 1; |
+ if (step == kRange || c == next) { |
+ bool odd = (((i - start_index) & 1) == 1); |
+ Label* in_range_label = odd ? odd_label : even_label; |
+ Label rest; |
+ EmitDoubleBoundaryTest(masm, c, next, &rest, in_range_label, &rest); |
+ ASSERT(!rest.is_linked()); |
+ // Cut out the single range by rewriting the array. |
+ for (int j = i; j > start_index; j--) { |
+ ranges2->at(j) = ranges2->at(j - 1); |
+ } |
+ for (int j = i + 1; j < end_index; j++) { |
+ ranges2->at(j) = ranges2->at(j + 1); |
+ } |
+ GenerateBranches(masm, |
+ ranges2, |
+ start_index + 1, |
+ end_index - 1, |
+ min_char, |
+ max_char, |
+ fall_through, |
+ even_label, |
+ odd_label); |
+ return; |
+ } |
+ } |
+ } |
+ UNREACHABLE(); |
+ } |
+ |
+ // If there are a lot of intervals in the regexp, then we will use tables to |
+ // determine whether the character is inside or outside the character class. |
+ static const int kBits = RegExpMacroAssembler::kTableSizeBits; |
+ static const int kSize = RegExpMacroAssembler::kTableSize; |
+ static const int kMask = RegExpMacroAssembler::kTableMask; |
+ |
+ if ((max_char >> kBits) == (min_char >> kBits)) { |
+ EmitUseLookupTable(masm, |
+ ranges2, |
+ start_index, |
+ end_index, |
+ min_char, |
+ fall_through, |
+ even_label, |
+ odd_label); |
+ } else { |
+ if ((min_char >> kBits) != (first >> kBits)) { |
+ masm->CheckCharacterLT(first, odd_label); |
+ GenerateBranches(masm, |
+ ranges2, |
+ start_index + 1, |
+ end_index, |
+ first, |
+ max_char, |
+ fall_through, |
+ odd_label, |
+ even_label); |
+ return; |
+ } |
+ |
+ // Unicode case. Split the search space into kSize spaces that are handled |
+ // with recursion. |
+ int new_start_index = start_index; |
+ int border = (first & ~kMask) + kSize; |
+ while (new_start_index <= end_index) { |
+ if (ranges2->at(new_start_index) > border) break; |
+ new_start_index++; |
+ } |
+ // new_start_index is the index of the first edge that is beyond the |
+ // current kSize space. |
+ |
+ // For very large search spaces we do a binary chop search of the non-ASCII |
+ // space instead of just going to the end of the current kSize space. The |
+ // heuristics are complicated a little by the fact that any 128-character |
+ // encoding space can be quickly tested with a table lookup, so we don't |
+ // wish to do binary chop search at a smaller granularity than that. A |
+ // 128-character space can take up a lot of space in the ranges2 array if, |
+ // for example, we only want to match every second character (eg. the lower |
+ // case characters on some Unicode pages). |
+ int binary_chop_index = (end_index + start_index) / 2; |
+ if (border - 1 > String::kMaxAsciiCharCode && |
+ end_index - start_index > (new_start_index - start_index) * 4 && |
+ last - first > kSize * 4 && |
+ binary_chop_index > new_start_index && |
+ ranges2->at(binary_chop_index) > first + 2 * kSize) { |
+ int scan_forward_for_section_border = binary_chop_index;; |
+ int new_border = (ranges2->at(binary_chop_index) | kMask) + 1; |
+ while (scan_forward_for_section_border < end_index) { |
+ if (ranges2->at(scan_forward_for_section_border) > new_border) { |
+ new_start_index = scan_forward_for_section_border; |
+ border = new_border; |
+ break; |
+ } |
+ scan_forward_for_section_border++; |
+ } |
+ } |
+ |
+ ASSERT(new_start_index > start_index); |
+ Label handle_rest; |
+ Label* above = &handle_rest; |
+ int new_end_index = new_start_index - 1; |
+ if (new_start_index > end_index) { |
+ // We didn't find any section that started after the limit, so everything |
+ // above the border is one of the terminal labels. |
+ above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; |
+ } |
+ if (ranges2->at(new_end_index) == border) { |
+ new_end_index--; |
+ } |
+ masm->CheckCharacterGT(border - 1, above); |
+ Label dummy; |
+ GenerateBranches(masm, |
+ ranges2, |
+ start_index, |
+ new_end_index, |
+ min_char, |
+ border - 1, |
+ &dummy, |
+ even_label, |
+ odd_label); |
+ if (handle_rest.is_linked()) { |
+ masm->Bind(&handle_rest); |
+ bool flip = (new_start_index & 1) != (start_index & 1); |
+ GenerateBranches(masm, |
+ ranges2, |
+ new_start_index, |
+ end_index, |
+ border, |
+ max_char, |
+ &dummy, |
+ flip ? odd_label : even_label, |
+ flip ? even_label : odd_label); |
+ } |
+ } |
+} |
+ |
+ |
static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
RegExpCharacterClass* cc, |
bool ascii, |
@@ -1542,6 +1812,10 @@ |
bool check_offset, |
bool preloaded) { |
ZoneList<CharacterRange>* ranges = cc->ranges(); |
+ if (!CharacterRange::IsCanonical(ranges)) { |
+ CharacterRange::Canonicalize(ranges); |
+ } |
+ |
int max_char; |
if (ascii) { |
max_char = String::kMaxAsciiCharCode; |
@@ -1549,11 +1823,6 @@ |
max_char = String::kMaxUtf16CodeUnit; |
} |
- Label success; |
- |
- Label* char_is_in_class = |
- cc->is_negated() ? on_failure : &success; |
- |
int range_count = ranges->length(); |
int last_valid_range = range_count - 1; |
@@ -1567,8 +1836,6 @@ |
if (last_valid_range < 0) { |
if (!cc->is_negated()) { |
- // TODO(plesner): We can remove this when the node level does our |
- // ASCII optimizations for us. |
macro_assembler->GoTo(on_failure); |
} |
if (check_offset) { |
@@ -1578,6 +1845,18 @@ |
} |
if (last_valid_range == 0 && |
+ ranges->at(0).IsEverything(max_char)) { |
+ if (cc->is_negated()) { |
+ macro_assembler->GoTo(on_failure); |
+ } else { |
+ // This is a common case hit by non-anchored expressions. |
+ if (check_offset) { |
+ macro_assembler->CheckPosition(cp_offset, on_failure); |
+ } |
+ } |
+ return; |
+ } |
+ if (last_valid_range == 0 && |
!cc->is_negated() && |
ranges->at(0).IsEverything(max_char)) { |
// This is a common case hit by non-anchored expressions. |
@@ -1597,64 +1876,45 @@ |
return; |
} |
- for (int i = 0; i < last_valid_range; i++) { |
- CharacterRange& range = ranges->at(i); |
- Label next_range; |
- uc16 from = range.from(); |
- uc16 to = range.to(); |
- if (from > max_char) { |
- continue; |
- } |
- if (to > max_char) to = max_char; |
- if (to == from) { |
- macro_assembler->CheckCharacter(to, char_is_in_class); |
- } else { |
- if (from != 0) { |
- macro_assembler->CheckCharacterLT(from, &next_range); |
- } |
- if (to != max_char) { |
- macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); |
- } else { |
- macro_assembler->GoTo(char_is_in_class); |
- } |
- } |
- macro_assembler->Bind(&next_range); |
- } |
- CharacterRange& range = ranges->at(last_valid_range); |
- uc16 from = range.from(); |
- uc16 to = range.to(); |
+ // A new list with ascending entries. Each entry is a code unit |
+ // where there is a boundary between code units that are part of |
+ // the class and code units that are not. Normally we insert an |
+ // entry at zero which goes to the failure label, but if there |
+ // was already one there we fall through for success on that entry. |
+ // Subsequent entries have alternating meaning (success/failure). |
+ ZoneList<int>* ranges2 = |
+ new ZoneList<int>(last_valid_range); |
- if (to > max_char) to = max_char; |
- ASSERT(to >= from); |
+ bool zeroth_entry_is_failure = true; |
+ if (cc->is_negated()) zeroth_entry_is_failure = false; |
- if (to == from) { |
- if (cc->is_negated()) { |
- macro_assembler->CheckCharacter(to, on_failure); |
- } else { |
- macro_assembler->CheckNotCharacter(to, on_failure); |
- } |
+ int start = ranges->at(0).from(); |
+ if (start == 0) { |
+ zeroth_entry_is_failure = !zeroth_entry_is_failure; |
} else { |
- if (from != 0) { |
- if (cc->is_negated()) { |
- macro_assembler->CheckCharacterLT(from, &success); |
- } else { |
- macro_assembler->CheckCharacterLT(from, on_failure); |
- } |
- } |
- if (to != String::kMaxUtf16CodeUnit) { |
- if (cc->is_negated()) { |
- macro_assembler->CheckCharacterLT(to + 1, on_failure); |
- } else { |
- macro_assembler->CheckCharacterGT(to, on_failure); |
- } |
- } else { |
- if (cc->is_negated()) { |
- macro_assembler->GoTo(on_failure); |
- } |
- } |
+ ranges2->Add(0); |
} |
- macro_assembler->Bind(&success); |
+ |
+ for (int i = 0; i <= last_valid_range; i++) { |
+ CharacterRange& range = ranges->at(i); |
+ ranges2->Add(range.from()); |
+ ranges2->Add(range.to() + 1); |
+ } |
+ int ranges2_length = ranges2->length() - 1; |
+ if (ranges2->at(ranges2_length) >= max_char) ranges2_length--; |
+ |
+ Label fall_through; |
+ GenerateBranches(macro_assembler, |
+ ranges2, |
+ 1, |
+ ranges2_length, |
+ 0, |
+ max_char, |
+ &fall_through, |
+ zeroth_entry_is_failure ? &fall_through : on_failure, |
+ zeroth_entry_is_failure ? on_failure : &fall_through); |
+ macro_assembler->Bind(&fall_through); |
} |