Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(133)

Unified Diff: src/jsregexp.cc

Issue 21126: Fix some glitches around unicode regexps matching ascii subject strings. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/objects.h » ('j') | src/objects.h » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 1235)
+++ src/jsregexp.cc (working copy)
@@ -1735,19 +1735,59 @@
static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
+// Returns the number of characters in the equivalence class, omitting those
+// that cannot occur in the source string because it is ASCII.
+static int GetCaseIndependentLetters(uc16 character,
+ bool ascii_subject,
+ unibrow::uchar* letters) {
+ int length = uncanonicalize.get(character, '\0', letters);
+ if (length < 2) {
+ // Unibrow returns 0 or 1 for characters where case independependence is
+ // trivial.
+ if (ascii_subject && character > String::kMaxAsciiCharCode) {
+ return 0; // It's impossible to find a match here.
+ }
+ letters[0] = character;
+ return 1;
+ }
+ if (!ascii_subject) {
+ return length;
+ }
+ int characters = 0;
+ for (int i = 0; i < length; i++) {
+ if (letters[i] <= String::kMaxAsciiCharCodeU) {
+ letters[characters++] = letters[i];
+ }
+ }
+ return characters;
+}
Lasse Reichstein 2009/02/06 15:50:56 If uncanonicalize implements Canonicalize^-1 o Can
+
+
// Only emits non-letters (things that don't have case). Only used for case
// independent matches.
static inline bool EmitAtomNonLetter(
RegExpMacroAssembler* macro_assembler,
uc16 c,
+ bool ascii,
Label* on_failure,
int cp_offset,
bool check,
bool preloaded) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- int length = uncanonicalize.get(c, '\0', chars);
+ int length = GetCaseIndependentLetters(c, ascii, chars);
+ if (length < 1) {
+ // Can't match.
+ macro_assembler->GoTo(on_failure);
+ return false; // Bounds not checked.
+ }
bool checked = false;
- if (length <= 1) {
+ // We handle the length > 1 case in another pass.
+ if (length == 1) {
Lasse Reichstein 2009/02/06 15:50:56 What about the length == 0 case? Could we emit a
+ if (ascii && c > String::kMaxAsciiCharCodeU) {
Lasse Reichstein 2009/02/06 15:50:56 Wasn't this checked in line 1747, so length will a
+ // Can't match.
+ macro_assembler->GoTo(on_failure);
+ return false; // Bounds not checked.
+ }
if (!preloaded) {
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
checked = check;
@@ -1808,7 +1848,7 @@
bool check,
bool preloaded) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
- int length = uncanonicalize.get(c, '\0', chars);
+ int length = GetCaseIndependentLetters(c, ascii, chars);
if (length <= 1) return false;
// We may not need to check against the end of the input string
// if this character lies before a character that matched.
@@ -2230,8 +2270,13 @@
if (compiler->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
uc16 c = quarks[i];
- int length = uncanonicalize.get(c, '\0', chars);
- if (length < 2) {
+ int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
+ if (length == 0) {
+ details->set_cannot_match();
+ pos->determines_perfectly = false;
+ return;
+ }
+ if (length == 1) {
// This letter has no case equivalents, so it's nice and simple
// and the mask-compare will determine definitely whether we have
// a match at this character position.
@@ -2261,6 +2306,11 @@
// Don't ignore case. Nice simple case where the mask-compare will
// determine definitely whether we have a match at this character
// position.
+ if (quarks[i] > char_mask) {
+ pos->determines_perfectly = false;
+ details->set_cannot_match();
+ return;
+ }
pos->mask = char_mask;
pos->value = quarks[i];
pos->determines_perfectly = true;
@@ -2276,7 +2326,6 @@
details->positions(characters_filled_in);
RegExpCharacterClass* tree = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = tree->ranges();
- CharacterRange range = ranges->at(0);
if (tree->is_negated()) {
// A quick check uses multi-character mask and compare. There is no
// useful way to incorporate a negative char class into this scheme
@@ -2285,27 +2334,46 @@
pos->mask = 0;
pos->value = 0;
} else {
- uint32_t differing_bits = (range.from() ^ range.to());
+ int first_range = 0;
+ while (ranges->at(first_range).from() > char_mask) {
+ first_range++;
+ if (first_range == ranges->length()) {
+ details->set_cannot_match();
+ pos->determines_perfectly = false;
+ return;
+ }
+ }
+ CharacterRange range = ranges->at(first_range);
+ uc16 from = range.from();
+ uc16 to = range.to();
+ if (to > char_mask) {
+ to = char_mask;
+ }
+ uint32_t differing_bits = (from ^ to);
// A mask and compare is only perfect if the differing bits form a
// number like 00011111 with one single block of trailing 1s.
if ((differing_bits & (differing_bits + 1)) == 0) {
pos->determines_perfectly = true;
}
uint32_t common_bits = ~SmearBitsRight(differing_bits);
- uint32_t bits = (range.from() & common_bits);
- for (int i = 1; i < ranges->length(); i++) {
+ uint32_t bits = (from & common_bits);
+ for (int i = first_range + 1; i < ranges->length(); i++) {
+ CharacterRange range = ranges->at(i);
+ uc16 from = range.from();
+ uc16 to = range.to();
+ if (from > char_mask) continue;
+ if (to > char_mask) to = char_mask;
// Here we are combining more ranges into the mask and compare
// value. With each new range the mask becomes more sparse and
// so the chances of a false positive rise. A character class
// with multiple ranges is assumed never to be equivalent to a
// mask and compare operation.
pos->determines_perfectly = false;
- CharacterRange range = ranges->at(i);
- uint32_t new_common_bits = (range.from() ^ range.to());
+ uint32_t new_common_bits = (from ^ to);
new_common_bits = ~SmearBitsRight(new_common_bits);
common_bits &= new_common_bits;
bits &= new_common_bits;
- uint32_t differing_bits = (range.from() & common_bits) ^ bits;
+ uint32_t differing_bits = (from & common_bits) ^ bits;
common_bits ^= differing_bits;
bits &= common_bits;
}
@@ -2673,6 +2741,7 @@
if (compiler->ignore_case()) {
bound_checked = EmitAtomNonLetter(assembler,
quarks[j],
+ ascii,
backtrack,
cp_offset + j,
*checked_up_to < cp_offset + j,
« no previous file with comments | « no previous file | src/objects.h » ('j') | src/objects.h » ('J')

Powered by Google App Engine
This is Rietveld 408576698