Chromium Code Reviews| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 3237) |
| +++ src/jsregexp.cc (working copy) |
| @@ -2432,16 +2432,19 @@ |
| } |
| -void TextNode::MakeCaseIndependent() { |
| +void TextNode::MakeCaseIndependent(bool is_ascii) { |
| int element_count = elms_->length(); |
| for (int i = 0; i < element_count; i++) { |
| TextElement elm = elms_->at(i); |
| if (elm.type == TextElement::CHAR_CLASS) { |
| RegExpCharacterClass* cc = elm.data.u_char_class; |
| + // None of the standard regexps is different in the case independent case |
|
Christian Plesner Hansen
2009/11/09 09:56:52
s/regexps is/character classes are/
|
| + // and it slows us down if we don't know that. |
| + if (cc->is_standard()) continue; |
| ZoneList<CharacterRange>* ranges = cc->ranges(); |
| int range_count = ranges->length(); |
| for (int j = 0; j < range_count; j++) { |
| - ranges->at(j).AddCaseEquivalents(ranges); |
| + ranges->at(j).AddCaseEquivalents(ranges, is_ascii); |
| } |
| } |
| } |
| @@ -3912,19 +3915,31 @@ |
| } |
| -void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { |
| +static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
| + int bottom, |
| + int top); |
| + |
| + |
| +void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| + bool is_ascii) { |
| + uc16 bottom = from(); |
| + uc16 top = to(); |
| + if (is_ascii) { |
| + if (bottom > String::kMaxAsciiCharCode) return; |
| + if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; |
| + } |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - if (IsSingleton()) { |
| + if (top == bottom) { |
| // If this is a singleton we just expand the one character. |
| - int length = uncanonicalize.get(from(), '\0', chars); |
| + int length = uncanonicalize.get(bottom, '\0', chars); |
| for (int i = 0; i < length; i++) { |
| uc32 chr = chars[i]; |
| - if (chr != from()) { |
| + if (chr != bottom) { |
| ranges->Add(CharacterRange::Singleton(chars[i])); |
| } |
| } |
| - } else if (from() <= kRangeCanonicalizeMax && |
| - to() <= kRangeCanonicalizeMax) { |
| + } else if (bottom <= kRangeCanonicalizeMax && |
| + top <= kRangeCanonicalizeMax) { |
| // If this is a range we expand the characters block by block, |
| // expanding contiguous subranges (blocks) one at a time. |
| // The approach is as follows. For a given start character we |
| @@ -3943,14 +3958,14 @@ |
| // completely contained in a block we do this for all the blocks |
| // covered by the range. |
| unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - // First, look up the block that contains the 'from' character. |
| - int length = canonrange.get(from(), '\0', range); |
| + // First, look up the block that contains the 'bottom' character. |
| + int length = canonrange.get(bottom, '\0', range); |
| if (length == 0) { |
| - range[0] = from(); |
| + range[0] = bottom; |
| } else { |
| ASSERT_EQ(1, length); |
| } |
| - int pos = from(); |
| + int pos = bottom; |
| // The start of the current block. Note that except for the first |
| // iteration 'start' is always equal to 'pos'. |
| int start; |
| @@ -3964,7 +3979,7 @@ |
| // Then we add the ranges one at a time, incrementing the current |
| // position to be after the last block each time. The position |
| // always points to the start of a block. |
| - while (pos < to()) { |
| + while (pos < top) { |
| length = canonrange.get(start, '\0', range); |
| if (length == 0) { |
| range[0] = start; |
| @@ -3975,57 +3990,120 @@ |
| // The start point of a block contains the distance to the end |
| // of the range. |
| int block_end = start + (range[0] & kPayloadMask) - 1; |
| - int end = (block_end > to()) ? to() : block_end; |
| + int end = (block_end > top) ? top : block_end; |
| length = uncanonicalize.get(start, '\0', range); |
| for (int i = 0; i < length; i++) { |
| uc32 c = range[i]; |
| uc16 range_from = c + (pos - start); |
| uc16 range_to = c + (end - start); |
| - if (!(from() <= range_from && range_to <= to())) { |
| + if (!(bottom <= range_from && range_to <= top)) { |
| ranges->Add(CharacterRange(range_from, range_to)); |
| } |
| } |
| start = pos = block_end + 1; |
| } |
| - } else if (from() > 0 || to() < String::kMaxUC16CharCode) { |
| + } else { |
| // Unibrow ranges don't work for high characters due to the "2^11 bug". |
| - // Therefore we do something dumber for these ranges. We don't bother |
| - // if the range is 0-max (as encountered at the start of an unanchored |
| - // regexp). |
| - ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
| - int bottom = from(); |
| - int top = to(); |
| - for (int i = bottom; i <= top; i++) { |
| - int length = uncanonicalize.get(i, '\0', chars); |
| - for (int j = 0; j < length; j++) { |
| - uc32 chr = chars[j]; |
| - if (chr != i && chr < bottom || chr > top) { |
| - characters->Add(chr); |
| + // Therefore we do something dumber for these ranges. |
| + AddUncanonicals(ranges, bottom, top); |
| + } |
| +} |
| + |
| + |
| +static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
| + int bottom, |
| + int top) { |
| + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| + // Zones with no case mappings: |
| + // 0x0600 - 0x0fff |
| + // 0x1100 - 0x1cff |
| + // 0x2000 - 0x20ff |
| + // 0x2200 - 0x23ff |
| + // 0x2500 - 0x2bff |
| + // 0x2e00 - 0xa5ff |
| + // 0xa800 - 0xfaff |
| + // 0xfc00 - 0xfeff |
| + const int boundary_count = 18; |
| + // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this |
| + // array. This is to split up big ranges and not because they actually denote |
| + // a case-mapping-free-zone. |
| + ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); |
| + const int kFirstRealCaselessZoneIndex = 2; |
| + int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, |
| + 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, |
| + 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; |
| + |
| + // Special ASCII rule from spec can save us some work here. |
| + if (bottom == 0x80 && top == 0xffff) return; |
| + |
| + // We have optimized support for this range. |
| + if (top <= CharacterRange::kRangeCanonicalizeMax) { |
| + CharacterRange range(bottom, top); |
| + range.AddCaseEquivalents(ranges, false); |
| + return; |
| + } |
| + |
| + // Split up very large ranges. |
| + for (int i = 0; i < boundary_count; i++) { |
| + if (bottom < boundaries[i] && top >= boundaries[i]) { |
| + AddUncanonicals(ranges, bottom, boundaries[i] - 1); |
| + AddUncanonicals(ranges, boundaries[i], top); |
| + return; |
| + } |
| + } |
| + |
| + // If we are completely in a zone with no case mappings then we are done. |
| + // We start at 2 so as not to except the ASCII range from mappings. |
| + for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |
| + if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
| +#ifdef DEBUG |
| + for (int j = bottom; j <= top; j++) { |
| + unsigned current_char = j; |
| + int length = uncanonicalize.get(current_char, '\0', chars); |
| + for (int k = 0; k < length; k++) { |
| + ASSERT(chars[k] == current_char); |
| } |
| } |
| +#endif |
| + return; |
| } |
| - if (characters->length() > 0) { |
| - int new_from = characters->at(0); |
| - int new_to = new_from; |
| - for (int i = 1; i < characters->length(); i++) { |
| - int chr = characters->at(i); |
| - if (chr == new_to + 1) { |
| - new_to++; |
| + } |
| + |
| + // Step through the range finding equivalent characters. |
| + ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
| + for (int i = bottom; i <= top; i++) { |
| + int length = uncanonicalize.get(i, '\0', chars); |
| + for (int j = 0; j < length; j++) { |
| + uc32 chr = chars[j]; |
| + if (chr != i && chr < bottom || chr > top) { |
| + characters->Add(chr); |
| + } |
| + } |
| + } |
| + |
| + // Step through the equivalent characters finding simple ranges and |
| + // adding ranges to the character class. |
| + if (characters->length() > 0) { |
| + int new_from = characters->at(0); |
| + int new_to = new_from; |
| + for (int i = 1; i < characters->length(); i++) { |
| + int chr = characters->at(i); |
| + if (chr == new_to + 1) { |
| + new_to++; |
| + } else { |
| + if (new_to == new_from) { |
| + ranges->Add(CharacterRange::Singleton(new_from)); |
| } else { |
| - if (new_to == new_from) { |
| - ranges->Add(CharacterRange::Singleton(new_from)); |
| - } else { |
| - ranges->Add(CharacterRange(new_from, new_to)); |
| - } |
| - new_from = new_to = chr; |
| + ranges->Add(CharacterRange(new_from, new_to)); |
| } |
| + new_from = new_to = chr; |
| } |
| - if (new_to == new_from) { |
| - ranges->Add(CharacterRange::Singleton(new_from)); |
| - } else { |
| - ranges->Add(CharacterRange(new_from, new_to)); |
| - } |
| } |
| + if (new_to == new_from) { |
| + ranges->Add(CharacterRange::Singleton(new_from)); |
| + } else { |
| + ranges->Add(CharacterRange(new_from, new_to)); |
| + } |
| } |
| } |
| @@ -4271,7 +4349,7 @@ |
| void Analysis::VisitText(TextNode* that) { |
| if (ignore_case_) { |
| - that->MakeCaseIndependent(); |
| + that->MakeCaseIndependent(is_ascii_); |
| } |
| EnsureAnalyzed(that->on_success()); |
| if (!has_failed()) { |
| @@ -4489,7 +4567,7 @@ |
| } |
| } |
| data->node = node; |
| - Analysis analysis(ignore_case); |
| + Analysis analysis(ignore_case, is_ascii); |
| analysis.EnsureAnalyzed(node); |
| if (analysis.has_failed()) { |
| const char* error_message = analysis.error_message(); |