src/jsregexp.cc - Issue 3030026: Updated unicode library.

Unified Diff: src/jsregexp.cc

Issue 3030026: Updated unicode library. (Closed)

Patch Set: Removed outdated comments. Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/jsregexp.cc

diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index 9f98782bbc19a375fec4a903040711a7b19b3760..c9afc62e156dfbfaaa06b66ee2e8a530545ad6f3 100644

--- a/src/jsregexp.cc

+++ b/src/jsregexp.cc

@@ -1272,7 +1272,7 @@ static int GetCaseIndependentLetters(uc16 character,

bool ascii_subject,

unibrow::uchar* letters) {

int length = uncanonicalize.get(character, '\0', letters);

- // Unibrow returns 0 or 1 for characters where case independependence is

+ // Unibrow returns 0 or 1 for characters where case independence is

// trivial.

if (length == 0) {

letters[0] = character;

@@ -4026,74 +4026,48 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

ranges->Add(CharacterRange::Singleton(chars[i]));

}

- } else if (bottom <= kRangeCanonicalizeMax &&

- top <= kRangeCanonicalizeMax) {

+ } else {

// If this is a range we expand the characters block by block,

// expanding contiguous subranges (blocks) one at a time.

// The approach is as follows. For a given start character we

- // look up the block that contains it, for instance 'a' if the

- // start character is 'c'. A block is characterized by the property

- // that all characters uncanonicalize in the same way as the first

- // element, except that each entry in the result is incremented

- // by the distance from the first element. So a-z is a block

- // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter

- // uncanonicalizes to ['a' + k, 'A' + k].

- // Once we've found the start point we look up its uncanonicalization

+ // look up the remainder of the block that contains it (represented

+ // by the end point), for instance we find 'z' if the character

+ // is 'c'. A block is characterized by the property

+ // that all characters uncanonicalize in the same way, except that

+ // each entry in the result is incremented by the distance from the first

+ // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and

+ // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].

+ // Once we've found the end point we look up its uncanonicalization

// and produce a range for each element. For instance for [c-f]

- // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only

+ // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only

// add a range if it is not already contained in the input, so [c-f]

// will be skipped but [C-F] will be added. If this range is not

// completely contained in a block we do this for all the blocks

- // covered by the range.

+ // covered by the range (handling characters that are not in a block

+ // as a "singleton block").

unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- // First, look up the block that contains the 'bottom' character.

- int length = canonrange.get(bottom, '\0', range);

- if (length == 0) {

- range[0] = bottom;

- } else {

- ASSERT_EQ(1, length);

- }

int pos = bottom;

- // The start of the current block. Note that except for the first

- // iteration 'start' is always equal to 'pos'.

- int start;

- // If it is not the start point of a block the entry contains the

- // offset of the character from the start point.

- if ((range[0] & kStartMarker) == 0) {

- start = pos - range[0];

- } else {

- start = pos;

- }

- // Then we add the ranges one at a time, incrementing the current

- // position to be after the last block each time. The position

- // always points to the start of a block.

while (pos < top) {

- length = canonrange.get(start, '\0', range);

+ int length = canonrange.get(pos, '\0', range);

+ uc16 block_end;

if (length == 0) {

- range[0] = start;

+ block_end = pos;

} else {

ASSERT_EQ(1, length);

+ block_end = range[0];

}

- ASSERT((range[0] & kStartMarker) != 0);

- // The start point of a block contains the distance to the end

- // of the range.

- int block_end = start + (range[0] & kPayloadMask) - 1;

int end = (block_end > top) ? top : block_end;

- length = uncanonicalize.get(start, '\0', range);

+ length = uncanonicalize.get(block_end, '\0', range);

for (int i = 0; i < length; i++) {

uc32 c = range[i];

- uc16 range_from = c + (pos - start);

- uc16 range_to = c + (end - start);

+ uc16 range_from = c - (block_end - pos);

+ uc16 range_to = c - (block_end - end);

if (!(bottom <= range_from && range_to <= top)) {

ranges->Add(CharacterRange(range_from, range_to));

}

- start = pos = block_end + 1;

+ pos = end + 1;

}

- } else {

- // Unibrow ranges don't work for high characters due to the "2^11 bug".

- // Therefore we do something dumber for these ranges.

- AddUncanonicals(ranges, bottom, top);

}

@@ -4208,20 +4182,14 @@ static void AddUncanonicals(ZoneList<CharacterRange>* ranges,

// 0xa800 - 0xfaff

// 0xfc00 - 0xfeff

const int boundary_count = 18;

- // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this

- // array. This is to split up big ranges and not because they actually denote

- // a case-mapping-free-zone.

- ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);

- const int kFirstRealCaselessZoneIndex = 2;

- int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,

+ int boundaries[] = {

0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,

0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};

// Special ASCII rule from spec can save us some work here.

if (bottom == 0x80 && top == 0xffff) return;

- // We have optimized support for this range.

- if (top <= CharacterRange::kRangeCanonicalizeMax) {

+ if (top <= boundaries[0]) {

CharacterRange range(bottom, top);

range.AddCaseEquivalents(ranges, false);

return;

@@ -4238,8 +4206,7 @@ static void AddUncanonicals(ZoneList<CharacterRange>* ranges,

}

// If we are completely in a zone with no case mappings then we are done.

- // We start at 2 so as not to except the ASCII range from mappings.

- for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {

+ for (int i = 0; i < boundary_count; i += 2) {

if (bottom >= boundaries[i] && top < boundaries[i + 1]) {

#ifdef DEBUG

for (int j = bottom; j <= top; j++) {

« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »