Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(303)

Unified Diff: src/jsregexp.cc

Issue 21450: Irregexp:... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/objects.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 1275)
+++ src/jsregexp.cc (working copy)
@@ -1739,19 +1739,51 @@
static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
+// Returns the number of characters in the equivalence class, omitting those
+// that cannot occur in the source string because it is ASCII.
+static int GetCaseIndependentLetters(uc16 character,
+ bool ascii_subject,
+ unibrow::uchar* letters) {
+ int length = uncanonicalize.get(character, '\0', letters);
+ // Unibrow returns 0 or 1 for characters where case independependence is
+ // trivial.
+ if (length == 0) {
+ letters[0] = character;
+ length = 1;
+ }
+ if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
+ return length;
+ }
+ // The standard requires that non-ASCII characters cannot have ASCII
+ // character codes in their equivalence class.
+ return 0;
+}
+
+
// Only emits non-letters (things that don't have case). Only used for case
// independent matches.
static inline bool EmitAtomNonLetter(
RegExpMacroAssembler* macro_assembler,
uc16 c,
+ bool ascii,
Label* on_failure,
int cp_offset,
bool check,
bool preloaded) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- int length = uncanonicalize.get(c, '\0', chars);
+ int length = GetCaseIndependentLetters(c, ascii, chars);
+ if (length < 1) {
+ // This can't match. Must be an ASCII subject and a non-ASCII character.
+ // We do not need to do anything since the ASCII pass already handled this.
+ return false; // Bounds not checked.
+ }
bool checked = false;
- if (length <= 1) {
+ // We handle the length > 1 case in a later pass.
+ if (length == 1) {
+ if (ascii && c > String::kMaxAsciiCharCodeU) {
+ // Can't match - see above.
+ return false; // Bounds not checked.
+ }
if (!preloaded) {
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
checked = check;
@@ -1805,14 +1837,14 @@
// matches.
static inline bool EmitAtomLetter(
RegExpMacroAssembler* macro_assembler,
+ uc16 c,
bool ascii,
- uc16 c,
Label* on_failure,
int cp_offset,
bool check,
bool preloaded) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- int length = uncanonicalize.get(c, '\0', chars);
+ int length = GetCaseIndependentLetters(c, ascii, chars);
if (length <= 1) return false;
// We may not need to check against the end of the input string
// if this character lies before a character that matched.
@@ -1854,10 +1886,10 @@
static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
RegExpCharacterClass* cc,
+ bool ascii,
+ Label* on_failure,
int cp_offset,
- Label* on_failure,
bool check_offset,
- bool ascii,
bool preloaded) {
if (cc->is_standard() &&
macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
@@ -2238,12 +2270,14 @@
// matching can turn an ASCII character into non-ASCII or
// vice versa.
details->set_cannot_match();
+ pos->determines_perfectly = false;
Lasse Reichstein 2009/02/18 15:20:44 Should this be set to false? If the remaining case
Erik Corry 2009/02/18 16:04:24 Determines perfectly indicates that this particula
return;
}
if (compiler->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- int length = uncanonicalize.get(c, '\0', chars);
- if (length < 2) {
+ int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
+ ASSERT(length != 0); // Can only happen if c > char_mask (see above).
+ if (length == 1) {
// This letter has no case equivalents, so it's nice and simple
// and the mask-compare will determine definitely whether we have
// a match at this character position.
@@ -2288,7 +2322,6 @@
details->positions(characters_filled_in);
RegExpCharacterClass* tree = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = tree->ranges();
- CharacterRange range = ranges->at(0);
if (tree->is_negated()) {
// A quick check uses multi-character mask and compare. There is no
// useful way to incorporate a negative char class into this scheme
@@ -2297,27 +2330,46 @@
pos->mask = 0;
pos->value = 0;
} else {
- uint32_t differing_bits = (range.from() ^ range.to());
+ int first_range = 0;
+ while (ranges->at(first_range).from() > char_mask) {
+ first_range++;
+ if (first_range == ranges->length()) {
+ details->set_cannot_match();
+ pos->determines_perfectly = false;
+ return;
+ }
+ }
+ CharacterRange range = ranges->at(first_range);
+ uc16 from = range.from();
+ uc16 to = range.to();
+ if (to > char_mask) {
+ to = char_mask;
+ }
+ uint32_t differing_bits = (from ^ to);
// A mask and compare is only perfect if the differing bits form a
// number like 00011111 with one single block of trailing 1s.
if ((differing_bits & (differing_bits + 1)) == 0) {
pos->determines_perfectly = true;
}
uint32_t common_bits = ~SmearBitsRight(differing_bits);
- uint32_t bits = (range.from() & common_bits);
- for (int i = 1; i < ranges->length(); i++) {
+ uint32_t bits = (from & common_bits);
+ for (int i = first_range + 1; i < ranges->length(); i++) {
+ CharacterRange range = ranges->at(i);
+ uc16 from = range.from();
+ uc16 to = range.to();
+ if (from > char_mask) continue;
+ if (to > char_mask) to = char_mask;
// Here we are combining more ranges into the mask and compare
// value. With each new range the mask becomes more sparse and
// so the chances of a false positive rise. A character class
// with multiple ranges is assumed never to be equivalent to a
// mask and compare operation.
pos->determines_perfectly = false;
- CharacterRange range = ranges->at(i);
- uint32_t new_common_bits = (range.from() ^ range.to());
+ uint32_t new_common_bits = (from ^ to);
new_common_bits = ~SmearBitsRight(new_common_bits);
common_bits &= new_common_bits;
bits &= new_common_bits;
- uint32_t differing_bits = (range.from() & common_bits) ^ bits;
+ uint32_t differing_bits = (from & common_bits) ^ bits;
common_bits ^= differing_bits;
bits &= common_bits;
}
@@ -2619,6 +2671,20 @@
}
+static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
+ if (quick_check == NULL) return false;
+ if (offset >= quick_check->characters()) return false;
+ return quick_check->positions(offset)->determines_perfectly;
Lasse Reichstein 2009/02/18 15:20:44 Just checking: If we find that a position determin
Erik Corry 2009/02/18 16:04:24 See above.
+}
+
+
+static void UpdateBoundsCheck(int index, int* checked_up_to) {
+ if (index > *checked_up_to) {
+ *checked_up_to = index;
+ }
+}
+
+
// We call this repeatedly to generate code for each pass over the text node.
// The passes are in increasing order of difficulty because we hope one
// of the first passes will fail in which case we are saved the work of the
@@ -2663,83 +2729,68 @@
TextElement elm = elms_->at(i);
int cp_offset = trace->cp_offset() + elm.cp_offset;
if (elm.type == TextElement::ATOM) {
- if (pass == NON_ASCII_MATCH ||
- pass == CHARACTER_MATCH ||
- pass == CASE_CHARACTER_MATCH) {
+ if (pass != CHARACTER_CLASS_MATCH) {
Vector<const uc16> quarks = elm.data.u_atom->data();
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
- bool bound_checked = true; // Most ops will check their bounds.
+ bool bound_checked = false;
if (first_element_checked && i == 0 && j == 0) continue;
- if (pass == NON_ASCII_MATCH) {
- ASSERT(ascii);
- if (quarks[j] > String::kMaxAsciiCharCode) {
- assembler->GoTo(backtrack);
- return;
- }
- } else {
- if (quick_check != NULL &&
- elm.cp_offset + j < quick_check->characters() &&
- quick_check->positions(elm.cp_offset + j)->
- determines_perfectly) {
- continue;
- }
- if (pass == CHARACTER_MATCH) {
- if (compiler->ignore_case()) {
- bound_checked = EmitAtomNonLetter(
- assembler,
- quarks[j],
+ if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
+ switch (pass) {
+ case NON_ASCII_MATCH:
+ ASSERT(ascii);
+ if (quarks[j] > String::kMaxAsciiCharCode) {
+ assembler->GoTo(backtrack);
+ return;
+ }
+ break;
+ case NON_LETTER_CHARACTER_MATCH:
+ bound_checked = EmitAtomNonLetter(assembler,
+ quarks[j],
+ ascii,
+ backtrack,
+ cp_offset + j,
+ *checked_up_to < cp_offset + j,
+ preloaded);
+ break;
+ case SIMPLE_CHARACTER_MATCH:
+ if (!preloaded) {
+ assembler->LoadCurrentCharacter(
+ cp_offset + j,
backtrack,
- cp_offset + j,
- *checked_up_to < cp_offset + j,
- preloaded);
- } else {
- if (!preloaded) {
- assembler->LoadCurrentCharacter(
- cp_offset + j,
- backtrack,
- *checked_up_to < cp_offset + j);
- }
- assembler->CheckNotCharacter(quarks[j], backtrack);
+ *checked_up_to < cp_offset + j);
+ bound_checked = true;
}
- } else {
- ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
- ASSERT(compiler->ignore_case());
+ assembler->CheckNotCharacter(quarks[j], backtrack);
+ break;
+ case CASE_CHARACTER_MATCH:
bound_checked = EmitAtomLetter(assembler,
- compiler->ascii(),
quarks[j],
+ ascii,
backtrack,
cp_offset + j,
*checked_up_to < cp_offset + j,
preloaded);
- }
- if (bound_checked) {
- if (cp_offset + j > *checked_up_to) {
- *checked_up_to = cp_offset + j;
- }
- }
+ break;
+ default:
+ break;
}
+ if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
}
}
} else {
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
- if (first_element_checked && i == 0) continue;
- if (quick_check != NULL &&
- elm.cp_offset < quick_check->characters() &&
- quick_check->positions(elm.cp_offset)->determines_perfectly) {
- continue;
- }
if (pass == CHARACTER_CLASS_MATCH) {
+ if (first_element_checked && i == 0) continue;
+ if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
RegExpCharacterClass* cc = elm.data.u_char_class;
EmitCharClass(assembler,
cc,
+ ascii,
+ backtrack,
cp_offset,
- backtrack,
*checked_up_to < cp_offset,
- ascii,
preloaded);
- if (cp_offset > *checked_up_to) {
- *checked_up_to = cp_offset;
- }
+ UpdateBoundsCheck(cp_offset, checked_up_to);
}
}
}
@@ -2757,6 +2808,16 @@
}
+bool TextNode::SkipPass(int int_pass, bool ignore_case) {
+ TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
+ if (ignore_case) {
+ return pass == SIMPLE_CHARACTER_MATCH;
+ } else {
+ return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
+ }
+}
+
+
// This generates the code to match a text node. A text node can contain
// straight character sequences (possibly to be matched in a case-independent
// way) and character classes. For efficiency we do not do this in a single
@@ -2785,50 +2846,30 @@
// If a character is preloaded into the current character register then
// check that now.
if (trace->characters_preloaded() == 1) {
- TextEmitPass(compiler,
- CHARACTER_MATCH,
- true,
- trace,
- false,
- &bound_checked_to);
- if (compiler->ignore_case()) {
+ for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+ if (!SkipPass(pass, compiler->ignore_case())) {
+ TextEmitPass(compiler,
+ static_cast<TextEmitPassType>(pass),
+ true,
+ trace,
+ false,
+ &bound_checked_to);
+ }
+ }
+ first_elt_done = true;
+ }
+
+ for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+ if (!SkipPass(pass, compiler->ignore_case())) {
TextEmitPass(compiler,
- CASE_CHARACTER_MATCH,
- true,
+ static_cast<TextEmitPassType>(pass),
+ false,
trace,
- false,
+ first_elt_done,
&bound_checked_to);
}
- TextEmitPass(compiler,
- CHARACTER_CLASS_MATCH,
- true,
- trace,
- false,
- &bound_checked_to);
- first_elt_done = true;
}
- TextEmitPass(compiler,
- CHARACTER_MATCH,
- false,
- trace,
- first_elt_done,
- &bound_checked_to);
- if (compiler->ignore_case()) {
- TextEmitPass(compiler,
- CASE_CHARACTER_MATCH,
- false,
- trace,
- first_elt_done,
- &bound_checked_to);
- }
- TextEmitPass(compiler,
- CHARACTER_CLASS_MATCH,
- false,
- trace,
- first_elt_done,
- &bound_checked_to);
-
Trace successor_trace(*trace);
successor_trace.set_at_start(false);
successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
« no previous file with comments | « src/jsregexp.h ('k') | src/objects.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698