Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5)

Unified Diff: src/jsregexp.cc

Issue 11349: Character range uncanonicalization. (Closed)
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index 862040a582c4037fe1d7c34b56de3b5e97968853..0ea417fc92a37821edf14ab711f10719c7768762 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -1874,6 +1874,90 @@ void CharacterRange::AddClassEscape(uc16 type,
}
+static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
+static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
+
+
+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ if (IsSingleton()) {
+ // If this is a singleton we just expand the one character.
+ int length = uncanonicalize.get(from(), '\0', chars);
+ for (int i = 0; i < length; i++) {
+ uc32 chr = chars[i];
+ if (chr != from()) {
+ ranges->Add(CharacterRange::Singleton(chars[i]));
+ }
+ }
+ } else if (from() <= kRangeCanonicalizeMax
+ && to() <= kRangeCanonicalizeMax) {
+ // If this is a range we expand the characters block by block,
+ // expanding contiguous subranges (blocks) one at a time.
+ // The approach is as follows. For a given start character we
+ // look up the block that contains it, for instance 'a' if the
+ // start character is 'c'. A block is characterized by the property
+ // that all characters uncanonicalize in the same way as the first
+ // element, except that each entry in the result is incremented
+ // by the distance from the first element. So a-z is a block
+ // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
+ // uncanonicalizes to ['a' + k, 'A' + k].
+ // Once we've found the start point we look up its uncanonicalization
+ // and produce a range for each element. For instance for [c-f]
+ // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
+ // add a range if it is not already contained in the input, so [c-f]
+ // will be skipped but [C-F] will be added. If this range is not
+ // completely contained in a block we do this for all the blocks
+ // covered by the range.
+ unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ // First, look up the block that contains the 'from' character.
+ int length = canonrange.get(from(), '\0', range);
+ if (length == 0) {
+ range[0] = from();
+ } else {
+ ASSERT_EQ(1, length);
+ }
+ int pos = from();
+ // The start of the current block. Note that except for the first
+ // iteration 'start' is always equal to 'pos'.
+ int start;
+ // If it is not the start point of a block the entry contains the
+ // offset of the character from the start point.
+ if ((range[0] & kStartMarker) == 0) {
+ start = pos - range[0];
+ } else {
+ start = pos;
+ }
+ // Then we add the ranges on at a time, incrementing the current
+ // position to be after the last block each time. The position
+ // always points to the start of a block.
+ while (pos < to()) {
+ length = canonrange.get(start, '\0', range);
+ if (length == 0) {
+ range[0] = start;
+ } else {
+ ASSERT_EQ(1, length);
+ }
+ ASSERT((range[0] & kStartMarker) != 0);
+ // The start point of a block contains the distance to the end
+ // of the range.
+ int block_end = start + (range[0] & kPayloadMask) - 1;
+ int end = (block_end > to()) ? to() : block_end;
+ length = uncanonicalize.get(start, '\0', range);
+ for (int i = 0; i < length; i++) {
+ uc32 c = range[i];
+ uc16 range_from = c + (pos - start);
+ uc16 range_to = c + (end - start);
+ if (!(from() <= range_from && range_to <= to()))
+ ranges->Add(CharacterRange(range_from, range_to));
+ }
+ start = pos = block_end + 1;
+ }
+ } else {
+ // TODO when we've fixed the 2^11 bug in unibrow.
+ }
+}
+
+
// -------------------------------------------------------------------
// Interest propagation
« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698