Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(600)

Unified Diff: src/jsregexp.cc

Issue 378024: * Fix regexp benchmark regression where we were doing work to... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 3237)
+++ src/jsregexp.cc (working copy)
@@ -2432,16 +2432,19 @@
}
-void TextNode::MakeCaseIndependent() {
+void TextNode::MakeCaseIndependent(bool is_ascii) {
int element_count = elms_->length();
for (int i = 0; i < element_count; i++) {
TextElement elm = elms_->at(i);
if (elm.type == TextElement::CHAR_CLASS) {
RegExpCharacterClass* cc = elm.data.u_char_class;
+ // None of the standard regexps is different in the case independent case
Christian Plesner Hansen 2009/11/09 09:56:52 s/regexps is/character classes are/
+ // and it slows us down if we don't know that.
+ if (cc->is_standard()) continue;
ZoneList<CharacterRange>* ranges = cc->ranges();
int range_count = ranges->length();
for (int j = 0; j < range_count; j++) {
- ranges->at(j).AddCaseEquivalents(ranges);
+ ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
}
}
}
@@ -3912,19 +3915,31 @@
}
-void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
+static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
+ int bottom,
+ int top);
+
+
+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
+ bool is_ascii) {
+ uc16 bottom = from();
+ uc16 top = to();
+ if (is_ascii) {
+ if (bottom > String::kMaxAsciiCharCode) return;
+ if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
+ }
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- if (IsSingleton()) {
+ if (top == bottom) {
// If this is a singleton we just expand the one character.
- int length = uncanonicalize.get(from(), '\0', chars);
+ int length = uncanonicalize.get(bottom, '\0', chars);
for (int i = 0; i < length; i++) {
uc32 chr = chars[i];
- if (chr != from()) {
+ if (chr != bottom) {
ranges->Add(CharacterRange::Singleton(chars[i]));
}
}
- } else if (from() <= kRangeCanonicalizeMax &&
- to() <= kRangeCanonicalizeMax) {
+ } else if (bottom <= kRangeCanonicalizeMax &&
+ top <= kRangeCanonicalizeMax) {
// If this is a range we expand the characters block by block,
// expanding contiguous subranges (blocks) one at a time.
// The approach is as follows. For a given start character we
@@ -3943,14 +3958,14 @@
// completely contained in a block we do this for all the blocks
// covered by the range.
unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- // First, look up the block that contains the 'from' character.
- int length = canonrange.get(from(), '\0', range);
+ // First, look up the block that contains the 'bottom' character.
+ int length = canonrange.get(bottom, '\0', range);
if (length == 0) {
- range[0] = from();
+ range[0] = bottom;
} else {
ASSERT_EQ(1, length);
}
- int pos = from();
+ int pos = bottom;
// The start of the current block. Note that except for the first
// iteration 'start' is always equal to 'pos'.
int start;
@@ -3964,7 +3979,7 @@
// Then we add the ranges one at a time, incrementing the current
// position to be after the last block each time. The position
// always points to the start of a block.
- while (pos < to()) {
+ while (pos < top) {
length = canonrange.get(start, '\0', range);
if (length == 0) {
range[0] = start;
@@ -3975,57 +3990,120 @@
// The start point of a block contains the distance to the end
// of the range.
int block_end = start + (range[0] & kPayloadMask) - 1;
- int end = (block_end > to()) ? to() : block_end;
+ int end = (block_end > top) ? top : block_end;
length = uncanonicalize.get(start, '\0', range);
for (int i = 0; i < length; i++) {
uc32 c = range[i];
uc16 range_from = c + (pos - start);
uc16 range_to = c + (end - start);
- if (!(from() <= range_from && range_to <= to())) {
+ if (!(bottom <= range_from && range_to <= top)) {
ranges->Add(CharacterRange(range_from, range_to));
}
}
start = pos = block_end + 1;
}
- } else if (from() > 0 || to() < String::kMaxUC16CharCode) {
+ } else {
// Unibrow ranges don't work for high characters due to the "2^11 bug".
- // Therefore we do something dumber for these ranges. We don't bother
- // if the range is 0-max (as encountered at the start of an unanchored
- // regexp).
- ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
- int bottom = from();
- int top = to();
- for (int i = bottom; i <= top; i++) {
- int length = uncanonicalize.get(i, '\0', chars);
- for (int j = 0; j < length; j++) {
- uc32 chr = chars[j];
- if (chr != i && chr < bottom || chr > top) {
- characters->Add(chr);
+ // Therefore we do something dumber for these ranges.
+ AddUncanonicals(ranges, bottom, top);
+ }
+}
+
+
+static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
+ int bottom,
+ int top) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ // Zones with no case mappings:
+ // 0x0600 - 0x0fff
+ // 0x1100 - 0x1cff
+ // 0x2000 - 0x20ff
+ // 0x2200 - 0x23ff
+ // 0x2500 - 0x2bff
+ // 0x2e00 - 0xa5ff
+ // 0xa800 - 0xfaff
+ // 0xfc00 - 0xfeff
+ const int boundary_count = 18;
+ // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
+ // array. This is to split up big ranges and not because they actually denote
+ // a case-mapping-free-zone.
+ ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
+ const int kFirstRealCaselessZoneIndex = 2;
+ int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
+ 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
+ 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
+
+ // Special ASCII rule from spec can save us some work here.
+ if (bottom == 0x80 && top == 0xffff) return;
+
+ // We have optimized support for this range.
+ if (top <= CharacterRange::kRangeCanonicalizeMax) {
+ CharacterRange range(bottom, top);
+ range.AddCaseEquivalents(ranges, false);
+ return;
+ }
+
+ // Split up very large ranges.
+ for (int i = 0; i < boundary_count; i++) {
+ if (bottom < boundaries[i] && top >= boundaries[i]) {
+ AddUncanonicals(ranges, bottom, boundaries[i] - 1);
+ AddUncanonicals(ranges, boundaries[i], top);
+ return;
+ }
+ }
+
+ // If we are completely in a zone with no case mappings then we are done.
+ // We start at 2 so as not to except the ASCII range from mappings.
+ for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
+ if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
+#ifdef DEBUG
+ for (int j = bottom; j <= top; j++) {
+ unsigned current_char = j;
+ int length = uncanonicalize.get(current_char, '\0', chars);
+ for (int k = 0; k < length; k++) {
+ ASSERT(chars[k] == current_char);
}
}
+#endif
+ return;
}
- if (characters->length() > 0) {
- int new_from = characters->at(0);
- int new_to = new_from;
- for (int i = 1; i < characters->length(); i++) {
- int chr = characters->at(i);
- if (chr == new_to + 1) {
- new_to++;
+ }
+
+ // Step through the range finding equivalent characters.
+ ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
+ for (int i = bottom; i <= top; i++) {
+ int length = uncanonicalize.get(i, '\0', chars);
+ for (int j = 0; j < length; j++) {
+ uc32 chr = chars[j];
+ if (chr != i && chr < bottom || chr > top) {
+ characters->Add(chr);
+ }
+ }
+ }
+
+ // Step through the equivalent characters finding simple ranges and
+ // adding ranges to the character class.
+ if (characters->length() > 0) {
+ int new_from = characters->at(0);
+ int new_to = new_from;
+ for (int i = 1; i < characters->length(); i++) {
+ int chr = characters->at(i);
+ if (chr == new_to + 1) {
+ new_to++;
+ } else {
+ if (new_to == new_from) {
+ ranges->Add(CharacterRange::Singleton(new_from));
} else {
- if (new_to == new_from) {
- ranges->Add(CharacterRange::Singleton(new_from));
- } else {
- ranges->Add(CharacterRange(new_from, new_to));
- }
- new_from = new_to = chr;
+ ranges->Add(CharacterRange(new_from, new_to));
}
+ new_from = new_to = chr;
}
- if (new_to == new_from) {
- ranges->Add(CharacterRange::Singleton(new_from));
- } else {
- ranges->Add(CharacterRange(new_from, new_to));
- }
}
+ if (new_to == new_from) {
+ ranges->Add(CharacterRange::Singleton(new_from));
+ } else {
+ ranges->Add(CharacterRange(new_from, new_to));
+ }
}
}
@@ -4271,7 +4349,7 @@
void Analysis::VisitText(TextNode* that) {
if (ignore_case_) {
- that->MakeCaseIndependent();
+ that->MakeCaseIndependent(is_ascii_);
}
EnsureAnalyzed(that->on_success());
if (!has_failed()) {
@@ -4489,7 +4567,7 @@
}
}
data->node = node;
- Analysis analysis(ignore_case);
+ Analysis analysis(ignore_case, is_ascii);
analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) {
const char* error_message = analysis.error_message();
« no previous file with comments | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698