Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 3237) |
+++ src/jsregexp.cc (working copy) |
@@ -2432,16 +2432,19 @@ |
} |
-void TextNode::MakeCaseIndependent() { |
+void TextNode::MakeCaseIndependent(bool is_ascii) { |
int element_count = elms_->length(); |
for (int i = 0; i < element_count; i++) { |
TextElement elm = elms_->at(i); |
if (elm.type == TextElement::CHAR_CLASS) { |
RegExpCharacterClass* cc = elm.data.u_char_class; |
+ // None of the standard regexps is different in the case independent case |
Christian Plesner Hansen
2009/11/09 09:56:52
s/regexps is/character classes are/
|
+ // and it slows us down if we don't know that. |
+ if (cc->is_standard()) continue; |
ZoneList<CharacterRange>* ranges = cc->ranges(); |
int range_count = ranges->length(); |
for (int j = 0; j < range_count; j++) { |
- ranges->at(j).AddCaseEquivalents(ranges); |
+ ranges->at(j).AddCaseEquivalents(ranges, is_ascii); |
} |
} |
} |
@@ -3912,19 +3915,31 @@ |
} |
-void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { |
+static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
+ int bottom, |
+ int top); |
+ |
+ |
+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
+ bool is_ascii) { |
+ uc16 bottom = from(); |
+ uc16 top = to(); |
+ if (is_ascii) { |
+ if (bottom > String::kMaxAsciiCharCode) return; |
+ if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; |
+ } |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- if (IsSingleton()) { |
+ if (top == bottom) { |
// If this is a singleton we just expand the one character. |
- int length = uncanonicalize.get(from(), '\0', chars); |
+ int length = uncanonicalize.get(bottom, '\0', chars); |
for (int i = 0; i < length; i++) { |
uc32 chr = chars[i]; |
- if (chr != from()) { |
+ if (chr != bottom) { |
ranges->Add(CharacterRange::Singleton(chars[i])); |
} |
} |
- } else if (from() <= kRangeCanonicalizeMax && |
- to() <= kRangeCanonicalizeMax) { |
+ } else if (bottom <= kRangeCanonicalizeMax && |
+ top <= kRangeCanonicalizeMax) { |
// If this is a range we expand the characters block by block, |
// expanding contiguous subranges (blocks) one at a time. |
// The approach is as follows. For a given start character we |
@@ -3943,14 +3958,14 @@ |
// completely contained in a block we do this for all the blocks |
// covered by the range. |
unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- // First, look up the block that contains the 'from' character. |
- int length = canonrange.get(from(), '\0', range); |
+ // First, look up the block that contains the 'bottom' character. |
+ int length = canonrange.get(bottom, '\0', range); |
if (length == 0) { |
- range[0] = from(); |
+ range[0] = bottom; |
} else { |
ASSERT_EQ(1, length); |
} |
- int pos = from(); |
+ int pos = bottom; |
// The start of the current block. Note that except for the first |
// iteration 'start' is always equal to 'pos'. |
int start; |
@@ -3964,7 +3979,7 @@ |
// Then we add the ranges one at a time, incrementing the current |
// position to be after the last block each time. The position |
// always points to the start of a block. |
- while (pos < to()) { |
+ while (pos < top) { |
length = canonrange.get(start, '\0', range); |
if (length == 0) { |
range[0] = start; |
@@ -3975,57 +3990,120 @@ |
// The start point of a block contains the distance to the end |
// of the range. |
int block_end = start + (range[0] & kPayloadMask) - 1; |
- int end = (block_end > to()) ? to() : block_end; |
+ int end = (block_end > top) ? top : block_end; |
length = uncanonicalize.get(start, '\0', range); |
for (int i = 0; i < length; i++) { |
uc32 c = range[i]; |
uc16 range_from = c + (pos - start); |
uc16 range_to = c + (end - start); |
- if (!(from() <= range_from && range_to <= to())) { |
+ if (!(bottom <= range_from && range_to <= top)) { |
ranges->Add(CharacterRange(range_from, range_to)); |
} |
} |
start = pos = block_end + 1; |
} |
- } else if (from() > 0 || to() < String::kMaxUC16CharCode) { |
+ } else { |
// Unibrow ranges don't work for high characters due to the "2^11 bug". |
- // Therefore we do something dumber for these ranges. We don't bother |
- // if the range is 0-max (as encountered at the start of an unanchored |
- // regexp). |
- ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
- int bottom = from(); |
- int top = to(); |
- for (int i = bottom; i <= top; i++) { |
- int length = uncanonicalize.get(i, '\0', chars); |
- for (int j = 0; j < length; j++) { |
- uc32 chr = chars[j]; |
- if (chr != i && chr < bottom || chr > top) { |
- characters->Add(chr); |
+ // Therefore we do something dumber for these ranges. |
+ AddUncanonicals(ranges, bottom, top); |
+ } |
+} |
+ |
+ |
+static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
+ int bottom, |
+ int top) { |
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ // Zones with no case mappings: |
+ // 0x0600 - 0x0fff |
+ // 0x1100 - 0x1cff |
+ // 0x2000 - 0x20ff |
+ // 0x2200 - 0x23ff |
+ // 0x2500 - 0x2bff |
+ // 0x2e00 - 0xa5ff |
+ // 0xa800 - 0xfaff |
+ // 0xfc00 - 0xfeff |
+ const int boundary_count = 18; |
+ // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this |
+ // array. This is to split up big ranges and not because they actually denote |
+ // a case-mapping-free-zone. |
+ ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); |
+ const int kFirstRealCaselessZoneIndex = 2; |
+ int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, |
+ 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, |
+ 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; |
+ |
+ // Special ASCII rule from spec can save us some work here. |
+ if (bottom == 0x80 && top == 0xffff) return; |
+ |
+ // We have optimized support for this range. |
+ if (top <= CharacterRange::kRangeCanonicalizeMax) { |
+ CharacterRange range(bottom, top); |
+ range.AddCaseEquivalents(ranges, false); |
+ return; |
+ } |
+ |
+ // Split up very large ranges. |
+ for (int i = 0; i < boundary_count; i++) { |
+ if (bottom < boundaries[i] && top >= boundaries[i]) { |
+ AddUncanonicals(ranges, bottom, boundaries[i] - 1); |
+ AddUncanonicals(ranges, boundaries[i], top); |
+ return; |
+ } |
+ } |
+ |
+ // If we are completely in a zone with no case mappings then we are done. |
+ // We start at 2 so as not to except the ASCII range from mappings. |
+ for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |
+ if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
+#ifdef DEBUG |
+ for (int j = bottom; j <= top; j++) { |
+ unsigned current_char = j; |
+ int length = uncanonicalize.get(current_char, '\0', chars); |
+ for (int k = 0; k < length; k++) { |
+ ASSERT(chars[k] == current_char); |
} |
} |
+#endif |
+ return; |
} |
- if (characters->length() > 0) { |
- int new_from = characters->at(0); |
- int new_to = new_from; |
- for (int i = 1; i < characters->length(); i++) { |
- int chr = characters->at(i); |
- if (chr == new_to + 1) { |
- new_to++; |
+ } |
+ |
+ // Step through the range finding equivalent characters. |
+ ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
+ for (int i = bottom; i <= top; i++) { |
+ int length = uncanonicalize.get(i, '\0', chars); |
+ for (int j = 0; j < length; j++) { |
+ uc32 chr = chars[j]; |
+ if (chr != i && chr < bottom || chr > top) { |
+ characters->Add(chr); |
+ } |
+ } |
+ } |
+ |
+ // Step through the equivalent characters finding simple ranges and |
+ // adding ranges to the character class. |
+ if (characters->length() > 0) { |
+ int new_from = characters->at(0); |
+ int new_to = new_from; |
+ for (int i = 1; i < characters->length(); i++) { |
+ int chr = characters->at(i); |
+ if (chr == new_to + 1) { |
+ new_to++; |
+ } else { |
+ if (new_to == new_from) { |
+ ranges->Add(CharacterRange::Singleton(new_from)); |
} else { |
- if (new_to == new_from) { |
- ranges->Add(CharacterRange::Singleton(new_from)); |
- } else { |
- ranges->Add(CharacterRange(new_from, new_to)); |
- } |
- new_from = new_to = chr; |
+ ranges->Add(CharacterRange(new_from, new_to)); |
} |
+ new_from = new_to = chr; |
} |
- if (new_to == new_from) { |
- ranges->Add(CharacterRange::Singleton(new_from)); |
- } else { |
- ranges->Add(CharacterRange(new_from, new_to)); |
- } |
} |
+ if (new_to == new_from) { |
+ ranges->Add(CharacterRange::Singleton(new_from)); |
+ } else { |
+ ranges->Add(CharacterRange(new_from, new_to)); |
+ } |
} |
} |
@@ -4271,7 +4349,7 @@ |
void Analysis::VisitText(TextNode* that) { |
if (ignore_case_) { |
- that->MakeCaseIndependent(); |
+ that->MakeCaseIndependent(is_ascii_); |
} |
EnsureAnalyzed(that->on_success()); |
if (!has_failed()) { |
@@ -4489,7 +4567,7 @@ |
} |
} |
data->node = node; |
- Analysis analysis(ignore_case); |
+ Analysis analysis(ignore_case, is_ascii); |
analysis.EnsureAnalyzed(node); |
if (analysis.has_failed()) { |
const char* error_message = analysis.error_message(); |