src/jsregexp.cc - Issue 378024: * Fix regexp benchmark regression where we were doing work to...

Unified Diff: src/jsregexp.cc

Issue 378024: * Fix regexp benchmark regression where we were doing work to... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 3237)

+++ src/jsregexp.cc (working copy)

@@ -2432,16 +2432,19 @@

}

-void TextNode::MakeCaseIndependent() {

+void TextNode::MakeCaseIndependent(bool is_ascii) {

int element_count = elms_->length();

for (int i = 0; i < element_count; i++) {

TextElement elm = elms_->at(i);

if (elm.type == TextElement::CHAR_CLASS) {

RegExpCharacterClass* cc = elm.data.u_char_class;

+ // None of the standard regexps is different in the case independent case

Christian Plesner Hansen (2009/11/09 09:56:52): s/regexps is/character classes are/

+ // and it slows us down if we don't know that.

+ if (cc->is_standard()) continue;

ZoneList<CharacterRange>* ranges = cc->ranges();

int range_count = ranges->length();

for (int j = 0; j < range_count; j++) {

- ranges->at(j).AddCaseEquivalents(ranges);

+ ranges->at(j).AddCaseEquivalents(ranges, is_ascii);

}

@@ -3912,19 +3915,31 @@

}

-void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {

+static void AddUncanonicals(ZoneList<CharacterRange>* ranges,

+ int bottom,

+ int top);

+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

+ bool is_ascii) {

+ uc16 bottom = from();

+ uc16 top = to();

+ if (is_ascii) {

+ if (bottom > String::kMaxAsciiCharCode) return;

+ if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;

+ }

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- if (IsSingleton()) {

+ if (top == bottom) {

// If this is a singleton we just expand the one character.

- int length = uncanonicalize.get(from(), '\0', chars);

+ int length = uncanonicalize.get(bottom, '\0', chars);

for (int i = 0; i < length; i++) {

uc32 chr = chars[i];

- if (chr != from()) {

+ if (chr != bottom) {

ranges->Add(CharacterRange::Singleton(chars[i]));

}

- } else if (from() <= kRangeCanonicalizeMax &&

- to() <= kRangeCanonicalizeMax) {

+ } else if (bottom <= kRangeCanonicalizeMax &&

+ top <= kRangeCanonicalizeMax) {

// If this is a range we expand the characters block by block,

// expanding contiguous subranges (blocks) one at a time.

// The approach is as follows. For a given start character we

@@ -3943,14 +3958,14 @@

// completely contained in a block we do this for all the blocks

// covered by the range.

unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- // First, look up the block that contains the 'from' character.

- int length = canonrange.get(from(), '\0', range);

+ // First, look up the block that contains the 'bottom' character.

+ int length = canonrange.get(bottom, '\0', range);

if (length == 0) {

- range[0] = from();

+ range[0] = bottom;

} else {

ASSERT_EQ(1, length);

}

- int pos = from();

+ int pos = bottom;

// The start of the current block. Note that except for the first

// iteration 'start' is always equal to 'pos'.

int start;

@@ -3964,7 +3979,7 @@

// Then we add the ranges one at a time, incrementing the current

// position to be after the last block each time. The position

// always points to the start of a block.

- while (pos < to()) {

+ while (pos < top) {

length = canonrange.get(start, '\0', range);

if (length == 0) {

range[0] = start;

@@ -3975,57 +3990,120 @@

// The start point of a block contains the distance to the end

// of the range.

int block_end = start + (range[0] & kPayloadMask) - 1;

- int end = (block_end > to()) ? to() : block_end;

+ int end = (block_end > top) ? top : block_end;

length = uncanonicalize.get(start, '\0', range);

for (int i = 0; i < length; i++) {

uc32 c = range[i];

uc16 range_from = c + (pos - start);

uc16 range_to = c + (end - start);

- if (!(from() <= range_from && range_to <= to())) {

+ if (!(bottom <= range_from && range_to <= top)) {

ranges->Add(CharacterRange(range_from, range_to));

}

start = pos = block_end + 1;

}

- } else if (from() > 0 || to() < String::kMaxUC16CharCode) {

+ } else {

// Unibrow ranges don't work for high characters due to the "2^11 bug".

- // Therefore we do something dumber for these ranges. We don't bother

- // if the range is 0-max (as encountered at the start of an unanchored

- // regexp).

- ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);

- int bottom = from();

- int top = to();

- for (int i = bottom; i <= top; i++) {

- int length = uncanonicalize.get(i, '\0', chars);

- for (int j = 0; j < length; j++) {

- uc32 chr = chars[j];

- if (chr != i && chr < bottom || chr > top) {

- characters->Add(chr);

+ // Therefore we do something dumber for these ranges.

+ AddUncanonicals(ranges, bottom, top);

+ }

+static void AddUncanonicals(ZoneList<CharacterRange>* ranges,

+ int bottom,

+ int top) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ // Zones with no case mappings:

+ // 0x0600 - 0x0fff

+ // 0x1100 - 0x1cff

+ // 0x2000 - 0x20ff

+ // 0x2200 - 0x23ff

+ // 0x2500 - 0x2bff

+ // 0x2e00 - 0xa5ff

+ // 0xa800 - 0xfaff

+ // 0xfc00 - 0xfeff

+ const int boundary_count = 18;

+ // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this

+ // array. This is to split up big ranges and not because they actually denote

+ // a case-mapping-free-zone.

+ ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);

+ const int kFirstRealCaselessZoneIndex = 2;

+ int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,

+ 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,

+ 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};

+ // Special ASCII rule from spec can save us some work here.

+ if (bottom == 0x80 && top == 0xffff) return;

+ // We have optimized support for this range.

+ if (top <= CharacterRange::kRangeCanonicalizeMax) {

+ CharacterRange range(bottom, top);

+ range.AddCaseEquivalents(ranges, false);

+ return;

+ }

+ // Split up very large ranges.

+ for (int i = 0; i < boundary_count; i++) {

+ if (bottom < boundaries[i] && top >= boundaries[i]) {

+ AddUncanonicals(ranges, bottom, boundaries[i] - 1);

+ AddUncanonicals(ranges, boundaries[i], top);

+ return;

+ }

+ // If we are completely in a zone with no case mappings then we are done.

+ // We start at 2 so as not to except the ASCII range from mappings.

+ for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {

+ if (bottom >= boundaries[i] && top < boundaries[i + 1]) {

+#ifdef DEBUG

+ for (int j = bottom; j <= top; j++) {

+ unsigned current_char = j;

+ int length = uncanonicalize.get(current_char, '\0', chars);

+ for (int k = 0; k < length; k++) {

+ ASSERT(chars[k] == current_char);

}

+#endif

+ return;

}

- if (characters->length() > 0) {

- int new_from = characters->at(0);

- int new_to = new_from;

- for (int i = 1; i < characters->length(); i++) {

- int chr = characters->at(i);

- if (chr == new_to + 1) {

- new_to++;

+ }

+ // Step through the range finding equivalent characters.

+ ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);

+ for (int i = bottom; i <= top; i++) {

+ int length = uncanonicalize.get(i, '\0', chars);

+ for (int j = 0; j < length; j++) {

+ uc32 chr = chars[j];

+ if (chr != i && chr < bottom || chr > top) {

+ characters->Add(chr);

+ }

+ // Step through the equivalent characters finding simple ranges and

+ // adding ranges to the character class.

+ if (characters->length() > 0) {

+ int new_from = characters->at(0);

+ int new_to = new_from;

+ for (int i = 1; i < characters->length(); i++) {

+ int chr = characters->at(i);

+ if (chr == new_to + 1) {

+ new_to++;

+ } else {

+ if (new_to == new_from) {

+ ranges->Add(CharacterRange::Singleton(new_from));

} else {

- if (new_to == new_from) {

- ranges->Add(CharacterRange::Singleton(new_from));

- } else {

- ranges->Add(CharacterRange(new_from, new_to));

- }

- new_from = new_to = chr;

+ ranges->Add(CharacterRange(new_from, new_to));

}

+ new_from = new_to = chr;

}

- if (new_to == new_from) {

- ranges->Add(CharacterRange::Singleton(new_from));

- } else {

- ranges->Add(CharacterRange(new_from, new_to));

- }

}

+ if (new_to == new_from) {

+ ranges->Add(CharacterRange::Singleton(new_from));

+ } else {

+ ranges->Add(CharacterRange(new_from, new_to));

+ }

}

@@ -4271,7 +4349,7 @@

void Analysis::VisitText(TextNode* that) {

if (ignore_case_) {

- that->MakeCaseIndependent();

+ that->MakeCaseIndependent(is_ascii_);

}

EnsureAnalyzed(that->on_success());

if (!has_failed()) {

@@ -4489,7 +4567,7 @@

}

data->node = node;

- Analysis analysis(ignore_case);

+ Analysis analysis(ignore_case, is_ascii);

analysis.EnsureAnalyzed(node);

if (analysis.has_failed()) {

const char* error_message = analysis.error_message();

« no previous file with comments | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »