src/jsregexp.cc - Issue 21126: Fix some glitches around unicode regexps matching ascii subject strings.

Unified Diff: src/jsregexp.cc

Issue 21126: Fix some glitches around unicode regexps matching ascii subject strings. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 1235)

+++ src/jsregexp.cc (working copy)

@@ -1735,19 +1735,59 @@

static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

+// Returns the number of characters in the equivalence class, omitting those

+// that cannot occur in the source string because it is ASCII.

+static int GetCaseIndependentLetters(uc16 character,

+ bool ascii_subject,

+ unibrow::uchar* letters) {

+ int length = uncanonicalize.get(character, '\0', letters);

+ if (length < 2) {

+ // Unibrow returns 0 or 1 for characters where case independependence is

+ // trivial.

+ if (ascii_subject && character > String::kMaxAsciiCharCode) {

+ return 0; // It's impossible to find a match here.

+ }

+ letters[0] = character;

+ return 1;

+ }

+ if (!ascii_subject) {

+ return length;

+ }

+ int characters = 0;

+ for (int i = 0; i < length; i++) {

+ if (letters[i] <= String::kMaxAsciiCharCodeU) {

+ letters[characters++] = letters[i];

+ }

+ return characters;

Lasse Reichstein 2009/02/06 15:50:56 If uncanonicalize implements Canonicalize^-1 o Can

// Only emits non-letters (things that don't have case). Only used for case

// independent matches.

static inline bool EmitAtomNonLetter(

RegExpMacroAssembler* macro_assembler,

uc16 c,

+ bool ascii,

Label* on_failure,

int cp_offset,

bool check,

bool preloaded) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- int length = uncanonicalize.get(c, '\0', chars);

+ int length = GetCaseIndependentLetters(c, ascii, chars);

+ if (length < 1) {

+ // Can't match.

+ macro_assembler->GoTo(on_failure);

+ return false; // Bounds not checked.

+ }

bool checked = false;

- if (length <= 1) {

+ // We handle the length > 1 case in another pass.

+ if (length == 1) {

Lasse Reichstein 2009/02/06 15:50:56 What about the length == 0 case? Could we emit a

+ if (ascii && c > String::kMaxAsciiCharCodeU) {

Lasse Reichstein 2009/02/06 15:50:56 Wasn't this checked in line 1747, so length will a

+ // Can't match.

+ macro_assembler->GoTo(on_failure);

+ return false; // Bounds not checked.

+ }

if (!preloaded) {

macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

checked = check;

@@ -1808,7 +1848,7 @@

bool check,

bool preloaded) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

- int length = uncanonicalize.get(c, '\0', chars);

+ int length = GetCaseIndependentLetters(c, ascii, chars);

if (length <= 1) return false;

// We may not need to check against the end of the input string

// if this character lies before a character that matched.

@@ -2230,8 +2270,13 @@

if (compiler->ignore_case()) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

uc16 c = quarks[i];

- int length = uncanonicalize.get(c, '\0', chars);

- if (length < 2) {

+ int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);

+ if (length == 0) {

+ details->set_cannot_match();

+ pos->determines_perfectly = false;

+ return;

+ }

+ if (length == 1) {

// This letter has no case equivalents, so it's nice and simple

// and the mask-compare will determine definitely whether we have

// a match at this character position.

@@ -2261,6 +2306,11 @@

// Don't ignore case. Nice simple case where the mask-compare will

// determine definitely whether we have a match at this character

// position.

+ if (quarks[i] > char_mask) {

+ pos->determines_perfectly = false;

+ details->set_cannot_match();

+ return;

+ }

pos->mask = char_mask;

pos->value = quarks[i];

pos->determines_perfectly = true;

@@ -2276,7 +2326,6 @@

details->positions(characters_filled_in);

RegExpCharacterClass* tree = elm.data.u_char_class;

ZoneList<CharacterRange>* ranges = tree->ranges();

- CharacterRange range = ranges->at(0);

if (tree->is_negated()) {

// A quick check uses multi-character mask and compare. There is no

// useful way to incorporate a negative char class into this scheme

@@ -2285,27 +2334,46 @@

pos->mask = 0;

pos->value = 0;

} else {

- uint32_t differing_bits = (range.from() ^ range.to());

+ int first_range = 0;

+ while (ranges->at(first_range).from() > char_mask) {

+ first_range++;

+ if (first_range == ranges->length()) {

+ details->set_cannot_match();

+ pos->determines_perfectly = false;

+ return;

+ }

+ CharacterRange range = ranges->at(first_range);

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (to > char_mask) {

+ to = char_mask;

+ }

+ uint32_t differing_bits = (from ^ to);

// A mask and compare is only perfect if the differing bits form a

// number like 00011111 with one single block of trailing 1s.

if ((differing_bits & (differing_bits + 1)) == 0) {

pos->determines_perfectly = true;

}

uint32_t common_bits = ~SmearBitsRight(differing_bits);

- uint32_t bits = (range.from() & common_bits);

- for (int i = 1; i < ranges->length(); i++) {

+ uint32_t bits = (from & common_bits);

+ for (int i = first_range + 1; i < ranges->length(); i++) {

+ CharacterRange range = ranges->at(i);

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (from > char_mask) continue;

+ if (to > char_mask) to = char_mask;

// Here we are combining more ranges into the mask and compare

// value. With each new range the mask becomes more sparse and

// so the chances of a false positive rise. A character class

// with multiple ranges is assumed never to be equivalent to a

// mask and compare operation.

pos->determines_perfectly = false;

- CharacterRange range = ranges->at(i);

- uint32_t new_common_bits = (range.from() ^ range.to());

+ uint32_t new_common_bits = (from ^ to);

new_common_bits = ~SmearBitsRight(new_common_bits);

common_bits &= new_common_bits;

bits &= new_common_bits;

- uint32_t differing_bits = (range.from() & common_bits) ^ bits;

+ uint32_t differing_bits = (from & common_bits) ^ bits;

common_bits ^= differing_bits;

bits &= common_bits;

}

@@ -2673,6 +2741,7 @@

if (compiler->ignore_case()) {

bound_checked = EmitAtomNonLetter(assembler,

quarks[j],

+ ascii,

backtrack,

cp_offset + j,

*checked_up_to < cp_offset + j,

« no previous file with comments | « no previous file | src/objects.h » ('j') | src/objects.h » ('J')