src/jsregexp.cc - Issue 21450: Irregexp:...

Unified Diff: src/jsregexp.cc

Issue 21450: Irregexp:... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 1275)

+++ src/jsregexp.cc (working copy)

@@ -1739,19 +1739,51 @@

static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

+// Returns the number of characters in the equivalence class, omitting those

+// that cannot occur in the source string because it is ASCII.

+static int GetCaseIndependentLetters(uc16 character,

+ bool ascii_subject,

+ unibrow::uchar* letters) {

+ int length = uncanonicalize.get(character, '\0', letters);

+ // Unibrow returns 0 or 1 for characters where case independependence is

+ // trivial.

+ if (length == 0) {

+ letters[0] = character;

+ length = 1;

+ }

+ if (!ascii_subject || character <= String::kMaxAsciiCharCode) {

+ return length;

+ }

+ // The standard requires that non-ASCII characters cannot have ASCII

+ // character codes in their equivalence class.

+ return 0;

// Only emits non-letters (things that don't have case). Only used for case

// independent matches.

static inline bool EmitAtomNonLetter(

RegExpMacroAssembler* macro_assembler,

uc16 c,

+ bool ascii,

Label* on_failure,

int cp_offset,

bool check,

bool preloaded) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- int length = uncanonicalize.get(c, '\0', chars);

+ int length = GetCaseIndependentLetters(c, ascii, chars);

+ if (length < 1) {

+ // This can't match. Must be an ASCII subject and a non-ASCII character.

+ // We do not need to do anything since the ASCII pass already handled this.

+ return false; // Bounds not checked.

+ }

bool checked = false;

- if (length <= 1) {

+ // We handle the length > 1 case in a later pass.

+ if (length == 1) {

+ if (ascii && c > String::kMaxAsciiCharCodeU) {

+ // Can't match - see above.

+ return false; // Bounds not checked.

+ }

if (!preloaded) {

macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

checked = check;

@@ -1805,14 +1837,14 @@

// matches.

static inline bool EmitAtomLetter(

RegExpMacroAssembler* macro_assembler,

+ uc16 c,

bool ascii,

- uc16 c,

Label* on_failure,

int cp_offset,

bool check,

bool preloaded) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- int length = uncanonicalize.get(c, '\0', chars);

+ int length = GetCaseIndependentLetters(c, ascii, chars);

if (length <= 1) return false;

// We may not need to check against the end of the input string

// if this character lies before a character that matched.

@@ -1854,10 +1886,10 @@

static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

RegExpCharacterClass* cc,

+ bool ascii,

+ Label* on_failure,

int cp_offset,

- Label* on_failure,

bool check_offset,

- bool ascii,

bool preloaded) {

if (cc->is_standard() &&

macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),

@@ -2238,12 +2270,14 @@

// matching can turn an ASCII character into non-ASCII or

// vice versa.

details->set_cannot_match();

+ pos->determines_perfectly = false;

Lasse Reichstein 2009/02/18 15:20:44 Should this be set to false? If the remaining case

Erik Corry 2009/02/18 16:04:24 Determines perfectly indicates that this particula

return;

}

if (compiler->ignore_case()) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- int length = uncanonicalize.get(c, '\0', chars);

- if (length < 2) {

+ int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);

+ ASSERT(length != 0); // Can only happen if c > char_mask (see above).

+ if (length == 1) {

// This letter has no case equivalents, so it's nice and simple

// and the mask-compare will determine definitely whether we have

// a match at this character position.

@@ -2288,7 +2322,6 @@

details->positions(characters_filled_in);

RegExpCharacterClass* tree = elm.data.u_char_class;

ZoneList<CharacterRange>* ranges = tree->ranges();

- CharacterRange range = ranges->at(0);

if (tree->is_negated()) {

// A quick check uses multi-character mask and compare. There is no

// useful way to incorporate a negative char class into this scheme

@@ -2297,27 +2330,46 @@

pos->mask = 0;

pos->value = 0;

} else {

- uint32_t differing_bits = (range.from() ^ range.to());

+ int first_range = 0;

+ while (ranges->at(first_range).from() > char_mask) {

+ first_range++;

+ if (first_range == ranges->length()) {

+ details->set_cannot_match();

+ pos->determines_perfectly = false;

+ return;

+ }

+ CharacterRange range = ranges->at(first_range);

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (to > char_mask) {

+ to = char_mask;

+ }

+ uint32_t differing_bits = (from ^ to);

// A mask and compare is only perfect if the differing bits form a

// number like 00011111 with one single block of trailing 1s.

if ((differing_bits & (differing_bits + 1)) == 0) {

pos->determines_perfectly = true;

}

uint32_t common_bits = ~SmearBitsRight(differing_bits);

- uint32_t bits = (range.from() & common_bits);

- for (int i = 1; i < ranges->length(); i++) {

+ uint32_t bits = (from & common_bits);

+ for (int i = first_range + 1; i < ranges->length(); i++) {

+ CharacterRange range = ranges->at(i);

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (from > char_mask) continue;

+ if (to > char_mask) to = char_mask;

// Here we are combining more ranges into the mask and compare

// value. With each new range the mask becomes more sparse and

// so the chances of a false positive rise. A character class

// with multiple ranges is assumed never to be equivalent to a

// mask and compare operation.

pos->determines_perfectly = false;

- CharacterRange range = ranges->at(i);

- uint32_t new_common_bits = (range.from() ^ range.to());

+ uint32_t new_common_bits = (from ^ to);

new_common_bits = ~SmearBitsRight(new_common_bits);

common_bits &= new_common_bits;

bits &= new_common_bits;

- uint32_t differing_bits = (range.from() & common_bits) ^ bits;

+ uint32_t differing_bits = (from & common_bits) ^ bits;

common_bits ^= differing_bits;

bits &= common_bits;

}

@@ -2619,6 +2671,20 @@

}

+static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {

+ if (quick_check == NULL) return false;

+ if (offset >= quick_check->characters()) return false;

+ return quick_check->positions(offset)->determines_perfectly;

Lasse Reichstein 2009/02/18 15:20:44 Just checking: If we find that a position determin

Erik Corry 2009/02/18 16:04:24 See above.

+static void UpdateBoundsCheck(int index, int* checked_up_to) {

+ if (index > *checked_up_to) {

+ *checked_up_to = index;

+ }

// We call this repeatedly to generate code for each pass over the text node.

// The passes are in increasing order of difficulty because we hope one

// of the first passes will fail in which case we are saved the work of the

@@ -2663,83 +2729,68 @@

TextElement elm = elms_->at(i);

int cp_offset = trace->cp_offset() + elm.cp_offset;

if (elm.type == TextElement::ATOM) {

- if (pass == NON_ASCII_MATCH ||

- pass == CHARACTER_MATCH ||

- pass == CASE_CHARACTER_MATCH) {

+ if (pass != CHARACTER_CLASS_MATCH) {

Vector<const uc16> quarks = elm.data.u_atom->data();

for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

- bool bound_checked = true; // Most ops will check their bounds.

+ bool bound_checked = false;

if (first_element_checked && i == 0 && j == 0) continue;

- if (pass == NON_ASCII_MATCH) {

- ASSERT(ascii);

- if (quarks[j] > String::kMaxAsciiCharCode) {

- assembler->GoTo(backtrack);

- return;

- }

- } else {

- if (quick_check != NULL &&

- elm.cp_offset + j < quick_check->characters() &&

- quick_check->positions(elm.cp_offset + j)->

- determines_perfectly) {

- continue;

- }

- if (pass == CHARACTER_MATCH) {

- if (compiler->ignore_case()) {

- bound_checked = EmitAtomNonLetter(

- assembler,

- quarks[j],

+ if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;

+ switch (pass) {

+ case NON_ASCII_MATCH:

+ ASSERT(ascii);

+ if (quarks[j] > String::kMaxAsciiCharCode) {

+ assembler->GoTo(backtrack);

+ return;

+ }

+ break;

+ case NON_LETTER_CHARACTER_MATCH:

+ bound_checked = EmitAtomNonLetter(assembler,

+ quarks[j],

+ ascii,

+ backtrack,

+ cp_offset + j,

+ *checked_up_to < cp_offset + j,

+ preloaded);

+ break;

+ case SIMPLE_CHARACTER_MATCH:

+ if (!preloaded) {

+ assembler->LoadCurrentCharacter(

+ cp_offset + j,

backtrack,

- cp_offset + j,

- *checked_up_to < cp_offset + j,

- preloaded);

- } else {

- if (!preloaded) {

- assembler->LoadCurrentCharacter(

- cp_offset + j,

- backtrack,

- *checked_up_to < cp_offset + j);

- }

- assembler->CheckNotCharacter(quarks[j], backtrack);

+ *checked_up_to < cp_offset + j);

+ bound_checked = true;

}

- } else {

- ASSERT_EQ(pass, CASE_CHARACTER_MATCH);

- ASSERT(compiler->ignore_case());

+ assembler->CheckNotCharacter(quarks[j], backtrack);

+ break;

+ case CASE_CHARACTER_MATCH:

bound_checked = EmitAtomLetter(assembler,

- compiler->ascii(),

quarks[j],

+ ascii,

backtrack,

cp_offset + j,

*checked_up_to < cp_offset + j,

preloaded);

- }

- if (bound_checked) {

- if (cp_offset + j > *checked_up_to) {

- *checked_up_to = cp_offset + j;

- }

+ break;

+ default:

+ break;

}

+ if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

}

} else {

ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

- if (first_element_checked && i == 0) continue;

- if (quick_check != NULL &&

- elm.cp_offset < quick_check->characters() &&

- quick_check->positions(elm.cp_offset)->determines_perfectly) {

- continue;

- }

if (pass == CHARACTER_CLASS_MATCH) {

+ if (first_element_checked && i == 0) continue;

+ if (DeterminedAlready(quick_check, elm.cp_offset)) continue;

RegExpCharacterClass* cc = elm.data.u_char_class;

EmitCharClass(assembler,

cc,

+ ascii,

+ backtrack,

cp_offset,

- backtrack,

*checked_up_to < cp_offset,

- ascii,

preloaded);

- if (cp_offset > *checked_up_to) {

- *checked_up_to = cp_offset;

- }

+ UpdateBoundsCheck(cp_offset, checked_up_to);

}

@@ -2757,6 +2808,16 @@

}

+bool TextNode::SkipPass(int int_pass, bool ignore_case) {

+ TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);

+ if (ignore_case) {

+ return pass == SIMPLE_CHARACTER_MATCH;

+ } else {

+ return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;

+ }

// This generates the code to match a text node. A text node can contain

// straight character sequences (possibly to be matched in a case-independent

// way) and character classes. For efficiency we do not do this in a single

@@ -2785,50 +2846,30 @@

// If a character is preloaded into the current character register then

// check that now.

if (trace->characters_preloaded() == 1) {

- TextEmitPass(compiler,

- CHARACTER_MATCH,

- true,

- trace,

- false,

- &bound_checked_to);

- if (compiler->ignore_case()) {

+ for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {

+ if (!SkipPass(pass, compiler->ignore_case())) {

+ TextEmitPass(compiler,

+ static_cast<TextEmitPassType>(pass),

+ true,

+ trace,

+ false,

+ &bound_checked_to);

+ }

+ first_elt_done = true;

+ }

+ for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {

+ if (!SkipPass(pass, compiler->ignore_case())) {

TextEmitPass(compiler,

- CASE_CHARACTER_MATCH,

- true,

+ static_cast<TextEmitPassType>(pass),

+ false,

trace,

- false,

+ first_elt_done,

&bound_checked_to);

}

- TextEmitPass(compiler,

- CHARACTER_CLASS_MATCH,

- true,

- trace,

- false,

- &bound_checked_to);

- first_elt_done = true;

}

- TextEmitPass(compiler,

- CHARACTER_MATCH,

- false,

- trace,

- first_elt_done,

- &bound_checked_to);

- if (compiler->ignore_case()) {

- TextEmitPass(compiler,

- CASE_CHARACTER_MATCH,

- false,

- trace,

- first_elt_done,

- &bound_checked_to);

- }

- TextEmitPass(compiler,

- CHARACTER_CLASS_MATCH,

- false,

- trace,

- first_elt_done,

- &bound_checked_to);

Trace successor_trace(*trace);

successor_trace.set_at_start(false);

successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);

« no previous file with comments | « src/jsregexp.h ('k') | src/objects.h » ('j') | no next file with comments »