Index: src/jsregexp.cc |
diff --git a/src/jsregexp.cc b/src/jsregexp.cc |
index 9f98782bbc19a375fec4a903040711a7b19b3760..c9afc62e156dfbfaaa06b66ee2e8a530545ad6f3 100644 |
--- a/src/jsregexp.cc |
+++ b/src/jsregexp.cc |
@@ -1272,7 +1272,7 @@ static int GetCaseIndependentLetters(uc16 character, |
bool ascii_subject, |
unibrow::uchar* letters) { |
int length = uncanonicalize.get(character, '\0', letters); |
- // Unibrow returns 0 or 1 for characters where case independependence is |
+ // Unibrow returns 0 or 1 for characters where case independence is |
// trivial. |
if (length == 0) { |
letters[0] = character; |
@@ -4026,74 +4026,48 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
ranges->Add(CharacterRange::Singleton(chars[i])); |
} |
} |
- } else if (bottom <= kRangeCanonicalizeMax && |
- top <= kRangeCanonicalizeMax) { |
+ } else { |
// If this is a range we expand the characters block by block, |
// expanding contiguous subranges (blocks) one at a time. |
// The approach is as follows. For a given start character we |
- // look up the block that contains it, for instance 'a' if the |
- // start character is 'c'. A block is characterized by the property |
- // that all characters uncanonicalize in the same way as the first |
- // element, except that each entry in the result is incremented |
- // by the distance from the first element. So a-z is a block |
- // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter |
- // uncanonicalizes to ['a' + k, 'A' + k]. |
- // Once we've found the start point we look up its uncanonicalization |
+ // look up the remainder of the block that contains it (represented |
+ // by the end point), for instance we find 'z' if the character |
+ // is 'c'. A block is characterized by the property |
+ // that all characters uncanonicalize in the same way, except that |
+ // each entry in the result is incremented by the distance from the first |
+ // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and |
+ // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. |
+ // Once we've found the end point we look up its uncanonicalization |
// and produce a range for each element. For instance for [c-f] |
- // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only |
+ // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only |
// add a range if it is not already contained in the input, so [c-f] |
// will be skipped but [C-F] will be added. If this range is not |
// completely contained in a block we do this for all the blocks |
- // covered by the range. |
+ // covered by the range (handling each character that is not in a block |
+ // as a "singleton block"). |
unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- // First, look up the block that contains the 'bottom' character. |
- int length = canonrange.get(bottom, '\0', range); |
- if (length == 0) { |
- range[0] = bottom; |
- } else { |
- ASSERT_EQ(1, length); |
- } |
int pos = bottom; |
- // The start of the current block. Note that except for the first |
- // iteration 'start' is always equal to 'pos'. |
- int start; |
- // If it is not the start point of a block the entry contains the |
- // offset of the character from the start point. |
- if ((range[0] & kStartMarker) == 0) { |
- start = pos - range[0]; |
- } else { |
- start = pos; |
- } |
- // Then we add the ranges one at a time, incrementing the current |
- // position to be after the last block each time. The position |
- // always points to the start of a block. |
while (pos < top) { |
- length = canonrange.get(start, '\0', range); |
+ int length = canonrange.get(pos, '\0', range); |
+ uc16 block_end; |
if (length == 0) { |
- range[0] = start; |
+ block_end = pos; |
} else { |
ASSERT_EQ(1, length); |
+ block_end = range[0]; |
} |
- ASSERT((range[0] & kStartMarker) != 0); |
- // The start point of a block contains the distance to the end |
- // of the range. |
- int block_end = start + (range[0] & kPayloadMask) - 1; |
int end = (block_end > top) ? top : block_end; |
- length = uncanonicalize.get(start, '\0', range); |
+ length = uncanonicalize.get(block_end, '\0', range); |
for (int i = 0; i < length; i++) { |
uc32 c = range[i]; |
- uc16 range_from = c + (pos - start); |
- uc16 range_to = c + (end - start); |
+ uc16 range_from = c - (block_end - pos); |
+ uc16 range_to = c - (block_end - end); |
if (!(bottom <= range_from && range_to <= top)) { |
ranges->Add(CharacterRange(range_from, range_to)); |
} |
} |
- start = pos = block_end + 1; |
+ pos = end + 1; |
} |
- } else { |
- // Unibrow ranges don't work for high characters due to the "2^11 bug". |
- // Therefore we do something dumber for these ranges. |
- AddUncanonicals(ranges, bottom, top); |
} |
} |
@@ -4208,20 +4182,14 @@ static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
// 0xa800 - 0xfaff |
// 0xfc00 - 0xfeff |
const int boundary_count = 18; |
- // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this |
- // array. This is to split up big ranges and not because they actually denote |
- // a case-mapping-free-zone. |
- ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); |
- const int kFirstRealCaselessZoneIndex = 2; |
- int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, |
+ int boundaries[] = { |
0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, |
0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; |
// Special ASCII rule from spec can save us some work here. |
if (bottom == 0x80 && top == 0xffff) return; |
- // We have optimized support for this range. |
- if (top <= CharacterRange::kRangeCanonicalizeMax) { |
+ if (top <= boundaries[0]) { |
CharacterRange range(bottom, top); |
range.AddCaseEquivalents(ranges, false); |
return; |
@@ -4238,8 +4206,7 @@ static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
} |
// If we are completely in a zone with no case mappings then we are done. |
- // We start at 2 so as not to except the ASCII range from mappings. |
- for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |
+ for (int i = 0; i < boundary_count; i += 2) { |
if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
#ifdef DEBUG |
for (int j = bottom; j <= top; j++) { |