| Index: src/jsregexp.cc
|
| ===================================================================
|
| --- src/jsregexp.cc (revision 11134)
|
| +++ src/jsregexp.cc (working copy)
|
| @@ -1534,6 +1534,276 @@
|
| }
|
|
|
|
|
| +static void EmitBoundaryTest(RegExpMacroAssembler* masm,
|
| + int border,
|
| + Label* fall_through,
|
| + Label* above_or_equal,
|
| + Label* below) {
|
| + if (below != fall_through) {
|
| + masm->CheckCharacterLT(border, below);
|
| + if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
|
| + } else {
|
| + masm->CheckCharacterGT(border - 1, above_or_equal);
|
| + }
|
| +}
|
| +
|
| +
|
| +static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
|
| + int first,
|
| + int last,
|
| + Label* fall_through,
|
| + Label* even_label,
|
| + Label* odd_label) {
|
| + if (even_label == fall_through) {
|
| + if (first == last) {
|
| + masm->CheckNotCharacter(first, odd_label);
|
| + } else {
|
| + masm->CheckCharacterNotInRange(first, last, odd_label);
|
| + }
|
| + } else {
|
| + if (first == last) {
|
| + masm->CheckCharacter(first, even_label);
|
| + } else {
|
| + masm->CheckCharacterInRange(first, last, even_label);
|
| + }
|
| + if (odd_label != fall_through) masm->GoTo(odd_label);
|
| + }
|
| +}
|
| +
|
| +
|
| +static void EmitUseLookupTable(
|
| + RegExpMacroAssembler* masm,
|
| + ZoneList<int>* ranges2,
|
| + int start_index,
|
| + int end_index,
|
| + int min_char,
|
| + Label* fall_through,
|
| + Label* even_label,
|
| + Label* odd_label) {
|
| + static const int kSize = RegExpMacroAssembler::kTableSize;
|
| + static const int kMask = RegExpMacroAssembler::kTableMask;
|
| +
|
| + char templ[kSize];
|
| + Label* on_bit_set;
|
| + Label* on_bit_clear;
|
| + int bit;
|
| + if (even_label == fall_through) {
|
| + on_bit_set = odd_label;
|
| + on_bit_clear = even_label;
|
| + bit = 1;
|
| + } else {
|
| + on_bit_set = even_label;
|
| + on_bit_clear = odd_label;
|
| + bit = 0;
|
| + }
|
| + int base = (min_char & ~kMask);
|
| + for (int i = 0; i < (ranges2->at(start_index) & kMask) && i < kSize; i++) {
|
| + templ[i] = bit;
|
| + }
|
| + int j = 0;
|
| + bit ^= 1;
|
| + for (int i = start_index; i < end_index; i++) {
|
| + for (j = (ranges2->at(i) & kMask);
|
| + j < ranges2->at(i + 1) - base && j < kSize;
|
| + j++) {
|
| + templ[j] = bit;
|
| + }
|
| + bit ^= 1;
|
| + }
|
| + for (int i = j; i < kSize; i++) {
|
| + templ[i] = bit;
|
| + }
|
| + // TODO(erikcorry): Cache these.
|
| + Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED);
|
| + for (int i = 0; i < kSize; i++) {
|
| + ba->set(i, templ[i]);
|
| + }
|
| + masm->CheckBitInTable(ba, on_bit_set);
|
| + if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
|
| +}
|
| +
|
| +
|
| +// Gets a series of segments representing a character class. If the character
|
| +// is in the range between even and an odd boundary (counting from start_index)
|
| +// then go to even_label, otherwise go to odd_label. We already know that
|
| +// the character is in the range of min_char to max_char inclusive. Either
|
| +// label can be NULL indicating backtracking. Either label can also be equal to
|
| +// the fall_through label.
|
| +static void GenerateBranches(RegExpMacroAssembler* masm,
|
| + ZoneList<int>* ranges2,
|
| + int start_index,
|
| + int end_index,
|
| + uc16 min_char,
|
| + uc16 max_char,
|
| + Label* fall_through,
|
| + Label* even_label,
|
| + Label* odd_label) {
|
| + int first = ranges2->at(start_index);
|
| + int last = ranges2->at(end_index) - 1;
|
| +
|
| + // Just need to test if the character is before or on-or-after
|
| + // a particular character.
|
| + if (start_index == end_index) {
|
| + EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
|
| + return;
|
| + }
|
| +
|
| + // Another almost trivial case: There is one interval in the middle that is
|
| + // different from the end intervals.
|
| + if (start_index + 1 == end_index) {
|
| + EmitDoubleBoundaryTest(
|
| + masm, first, last, fall_through, even_label, odd_label);
|
| + return;
|
| + }
|
| +
|
| + // It's not worth using table lookup if there are very few intervals in the
|
| + // character class.
|
| + if (end_index - start_index <= 6) {
|
| + typedef enum { kSingleCharacter, kRange, kDone } SearchOrder;
|
| + // It is faster to test for individual characters, so we look for those
|
| + // first, then try arbitrary ranges in the second round.
|
| + for (SearchOrder step = kSingleCharacter;
|
| + step < kDone;
|
| + step = (step == kSingleCharacter) ? kRange : kDone) {
|
| + for (int i = start_index; i < end_index; i++) {
|
| + int c = ranges2->at(i);
|
| + int next = ranges2->at(i + 1) - 1;
|
| + if (step == kRange || c == next) {
|
| + bool odd = (((i - start_index) & 1) == 1);
|
| + Label* in_range_label = odd ? odd_label : even_label;
|
| + Label rest;
|
| + EmitDoubleBoundaryTest(masm, c, next, &rest, in_range_label, &rest);
|
| + ASSERT(!rest.is_linked());
|
| + // Cut out the single range by rewriting the array.
|
| + for (int j = i; j > start_index; j--) {
|
| + ranges2->at(j) = ranges2->at(j - 1);
|
| + }
|
| + for (int j = i + 1; j < end_index; j++) {
|
| + ranges2->at(j) = ranges2->at(j + 1);
|
| + }
|
| + GenerateBranches(masm,
|
| + ranges2,
|
| + start_index + 1,
|
| + end_index - 1,
|
| + min_char,
|
| + max_char,
|
| + fall_through,
|
| + even_label,
|
| + odd_label);
|
| + return;
|
| + }
|
| + }
|
| + }
|
| + UNREACHABLE();
|
| + }
|
| +
|
| + // If there are a lot of intervals in the regexp, then we will use tables to
|
| + // determine whether the character is inside or outside the character class.
|
| + static const int kBits = RegExpMacroAssembler::kTableSizeBits;
|
| + static const int kSize = RegExpMacroAssembler::kTableSize;
|
| + static const int kMask = RegExpMacroAssembler::kTableMask;
|
| +
|
| + if ((max_char >> kBits) == (min_char >> kBits)) {
|
| + EmitUseLookupTable(masm,
|
| + ranges2,
|
| + start_index,
|
| + end_index,
|
| + min_char,
|
| + fall_through,
|
| + even_label,
|
| + odd_label);
|
| + } else {
|
| + if ((min_char >> kBits) != (first >> kBits)) {
|
| + masm->CheckCharacterLT(first, odd_label);
|
| + GenerateBranches(masm,
|
| + ranges2,
|
| + start_index + 1,
|
| + end_index,
|
| + first,
|
| + max_char,
|
| + fall_through,
|
| + odd_label,
|
| + even_label);
|
| + return;
|
| + }
|
| +
|
| + // Unicode case. Split the search space into kSize spaces that are handled
|
| + // with recursion.
|
| + int new_start_index = start_index;
|
| + int border = (first & ~kMask) + kSize;
|
| + while (new_start_index <= end_index) {
|
| + if (ranges2->at(new_start_index) > border) break;
|
| + new_start_index++;
|
| + }
|
| + // new_start_index is the index of the first edge that is beyond the
|
| + // current kSize space.
|
| +
|
| + // For very large search spaces we do a binary chop search of the non-ASCII
|
| + // space instead of just going to the end of the current kSize space. The
|
| + // heuristics are complicated a little by the fact that any 128-character
|
| + // encoding space can be quickly tested with a table lookup, so we don't
|
| + // wish to do binary chop search at a smaller granularity than that. A
|
| + // 128-character space can take up a lot of space in the ranges2 array if,
|
| + // for example, we only want to match every second character (eg. the lower
|
| + // case characters on some Unicode pages).
|
| + int binary_chop_index = (end_index + start_index) / 2;
|
| + if (border - 1 > String::kMaxAsciiCharCode &&
|
| + end_index - start_index > (new_start_index - start_index) * 4 &&
|
| + last - first > kSize * 4 &&
|
| + binary_chop_index > new_start_index &&
|
| + ranges2->at(binary_chop_index) > first + 2 * kSize) {
|
| + int scan_forward_for_section_border = binary_chop_index;;
|
| + int new_border = (ranges2->at(binary_chop_index) | kMask) + 1;
|
| + while (scan_forward_for_section_border < end_index) {
|
| + if (ranges2->at(scan_forward_for_section_border) > new_border) {
|
| + new_start_index = scan_forward_for_section_border;
|
| + border = new_border;
|
| + break;
|
| + }
|
| + scan_forward_for_section_border++;
|
| + }
|
| + }
|
| +
|
| + ASSERT(new_start_index > start_index);
|
| + Label handle_rest;
|
| + Label* above = &handle_rest;
|
| + int new_end_index = new_start_index - 1;
|
| + if (new_start_index > end_index) {
|
| + // We didn't find any section that started after the limit, so everything
|
| + // above the border is one of the terminal labels.
|
| + above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
|
| + }
|
| + if (ranges2->at(new_end_index) == border) {
|
| + new_end_index--;
|
| + }
|
| + masm->CheckCharacterGT(border - 1, above);
|
| + Label dummy;
|
| + GenerateBranches(masm,
|
| + ranges2,
|
| + start_index,
|
| + new_end_index,
|
| + min_char,
|
| + border - 1,
|
| + &dummy,
|
| + even_label,
|
| + odd_label);
|
| + if (handle_rest.is_linked()) {
|
| + masm->Bind(&handle_rest);
|
| + bool flip = (new_start_index & 1) != (start_index & 1);
|
| + GenerateBranches(masm,
|
| + ranges2,
|
| + new_start_index,
|
| + end_index,
|
| + border,
|
| + max_char,
|
| + &dummy,
|
| + flip ? odd_label : even_label,
|
| + flip ? even_label : odd_label);
|
| + }
|
| + }
|
| +}
|
| +
|
| +
|
| static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
| RegExpCharacterClass* cc,
|
| bool ascii,
|
| @@ -1542,6 +1812,10 @@
|
| bool check_offset,
|
| bool preloaded) {
|
| ZoneList<CharacterRange>* ranges = cc->ranges();
|
| + if (!CharacterRange::IsCanonical(ranges)) {
|
| + CharacterRange::Canonicalize(ranges);
|
| + }
|
| +
|
| int max_char;
|
| if (ascii) {
|
| max_char = String::kMaxAsciiCharCode;
|
| @@ -1549,11 +1823,6 @@
|
| max_char = String::kMaxUtf16CodeUnit;
|
| }
|
|
|
| - Label success;
|
| -
|
| - Label* char_is_in_class =
|
| - cc->is_negated() ? on_failure : &success;
|
| -
|
| int range_count = ranges->length();
|
|
|
| int last_valid_range = range_count - 1;
|
| @@ -1567,8 +1836,6 @@
|
|
|
| if (last_valid_range < 0) {
|
| if (!cc->is_negated()) {
|
| - // TODO(plesner): We can remove this when the node level does our
|
| - // ASCII optimizations for us.
|
| macro_assembler->GoTo(on_failure);
|
| }
|
| if (check_offset) {
|
| @@ -1578,6 +1845,18 @@
|
| }
|
|
|
| if (last_valid_range == 0 &&
|
| + ranges->at(0).IsEverything(max_char)) {
|
| + if (cc->is_negated()) {
|
| + macro_assembler->GoTo(on_failure);
|
| + } else {
|
| + // This is a common case hit by non-anchored expressions.
|
| + if (check_offset) {
|
| + macro_assembler->CheckPosition(cp_offset, on_failure);
|
| + }
|
| + }
|
| + return;
|
| + }
|
| + if (last_valid_range == 0 &&
|
| !cc->is_negated() &&
|
| ranges->at(0).IsEverything(max_char)) {
|
| // This is a common case hit by non-anchored expressions.
|
| @@ -1597,64 +1876,45 @@
|
| return;
|
| }
|
|
|
| - for (int i = 0; i < last_valid_range; i++) {
|
| - CharacterRange& range = ranges->at(i);
|
| - Label next_range;
|
| - uc16 from = range.from();
|
| - uc16 to = range.to();
|
| - if (from > max_char) {
|
| - continue;
|
| - }
|
| - if (to > max_char) to = max_char;
|
| - if (to == from) {
|
| - macro_assembler->CheckCharacter(to, char_is_in_class);
|
| - } else {
|
| - if (from != 0) {
|
| - macro_assembler->CheckCharacterLT(from, &next_range);
|
| - }
|
| - if (to != max_char) {
|
| - macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
|
| - } else {
|
| - macro_assembler->GoTo(char_is_in_class);
|
| - }
|
| - }
|
| - macro_assembler->Bind(&next_range);
|
| - }
|
|
|
| - CharacterRange& range = ranges->at(last_valid_range);
|
| - uc16 from = range.from();
|
| - uc16 to = range.to();
|
| + // A new list with ascending entries. Each entry is a code unit
|
| + // where there is a boundary between code units that are part of
|
| + // the class and code units that are not. Normally we insert an
|
| + // entry at zero which goes to the failure label, but if there
|
| + // was already one there we fall through for success on that entry.
|
| + // Subsequent entries have alternating meaning (success/failure).
|
| + ZoneList<int>* ranges2 =
|
| + new ZoneList<int>(last_valid_range);
|
|
|
| - if (to > max_char) to = max_char;
|
| - ASSERT(to >= from);
|
| + bool zeroth_entry_is_failure = true;
|
| + if (cc->is_negated()) zeroth_entry_is_failure = false;
|
|
|
| - if (to == from) {
|
| - if (cc->is_negated()) {
|
| - macro_assembler->CheckCharacter(to, on_failure);
|
| - } else {
|
| - macro_assembler->CheckNotCharacter(to, on_failure);
|
| - }
|
| + int start = ranges->at(0).from();
|
| + if (start == 0) {
|
| + zeroth_entry_is_failure = !zeroth_entry_is_failure;
|
| } else {
|
| - if (from != 0) {
|
| - if (cc->is_negated()) {
|
| - macro_assembler->CheckCharacterLT(from, &success);
|
| - } else {
|
| - macro_assembler->CheckCharacterLT(from, on_failure);
|
| - }
|
| - }
|
| - if (to != String::kMaxUtf16CodeUnit) {
|
| - if (cc->is_negated()) {
|
| - macro_assembler->CheckCharacterLT(to + 1, on_failure);
|
| - } else {
|
| - macro_assembler->CheckCharacterGT(to, on_failure);
|
| - }
|
| - } else {
|
| - if (cc->is_negated()) {
|
| - macro_assembler->GoTo(on_failure);
|
| - }
|
| - }
|
| + ranges2->Add(0);
|
| }
|
| - macro_assembler->Bind(&success);
|
| +
|
| + for (int i = 0; i <= last_valid_range; i++) {
|
| + CharacterRange& range = ranges->at(i);
|
| + ranges2->Add(range.from());
|
| + ranges2->Add(range.to() + 1);
|
| + }
|
| + int ranges2_length = ranges2->length() - 1;
|
| + if (ranges2->at(ranges2_length) >= max_char) ranges2_length--;
|
| +
|
| + Label fall_through;
|
| + GenerateBranches(macro_assembler,
|
| + ranges2,
|
| + 1,
|
| + ranges2_length,
|
| + 0,
|
| + max_char,
|
| + &fall_through,
|
| + zeroth_entry_is_failure ? &fall_through : on_failure,
|
| + zeroth_entry_is_failure ? on_failure : &fall_through);
|
| + macro_assembler->Bind(&fall_through);
|
| }
|
|
|
|
|
|
|