| Index: src/jsregexp.cc
|
| diff --git a/src/jsregexp.cc b/src/jsregexp.cc
|
| index e284e8cb15f3233eee2533bc9efda8b6735bd1e3..6de8596071544929e7300a8c05c8f1069498b48c 100644
|
| --- a/src/jsregexp.cc
|
| +++ b/src/jsregexp.cc
|
| @@ -1566,7 +1566,7 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
|
|
|
|
|
| // Returns the number of characters in the equivalence class, omitting those
|
| -// that cannot occur in the source string because it is ASCII.
|
| +// that cannot occur in the subject string when it is one-byte (Latin1).
|
| static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
| bool one_byte_subject,
|
| unibrow::uchar* letters) {
|
| @@ -1578,15 +1578,18 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
| letters[0] = character;
|
| length = 1;
|
| }
|
| - if (!one_byte_subject || character <= String::kMaxOneByteCharCode) {
|
| - return length;
|
| +
|
| + if (one_byte_subject) {
|
| + int new_length = 0;
|
| + for (int i = 0; i < length; i++) {
|
| + if (letters[i] <= String::kMaxOneByteCharCode) {
|
| + letters[new_length++] = letters[i];
|
| + }
|
| + }
|
| + length = new_length;
|
| }
|
|
|
| - // The standard requires that non-ASCII characters cannot have ASCII
|
| - // character codes in their equivalence class.
|
| - // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
|
| - // is it? For example, \u00C5 is equivalent to \u212B.
|
| - return 0;
|
| + return length;
|
| }
|
|
|
|
|
| @@ -2525,22 +2528,17 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
| QuickCheckDetails::Position* pos =
|
| details->positions(characters_filled_in);
|
| uc16 c = quarks[i];
|
| - if (c > char_mask) {
|
| - // If we expect a non-Latin1 character from an one-byte string,
|
| - // there is no way we can match. Not even case-independent
|
| - // matching can turn an Latin1 character into non-Latin1 or
|
| - // vice versa.
|
| - // TODO(dcarney): issue 3550. Verify that this works as expected.
|
| - // For example, \u0178 is uppercase of \u00ff (y-umlaut).
|
| - details->set_cannot_match();
|
| - pos->determines_perfectly = false;
|
| - return;
|
| - }
|
| if (compiler->ignore_case()) {
|
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
|
| int length = GetCaseIndependentLetters(isolate, c,
|
| compiler->one_byte(), chars);
|
| - DCHECK(length != 0); // Can only happen if c > char_mask (see above).
|
| + if (length == 0) {
|
| + // This can happen when every case variant is non-Latin1 while the
|
| + // subject string is known to be one-byte (Latin1).
|
| + details->set_cannot_match();
|
| + pos->determines_perfectly = false;
|
| + return;
|
| + }
|
| if (length == 1) {
|
| // This letter has no case equivalents, so it's nice and simple
|
| // and the mask-compare will determine definitely whether we have
|
| @@ -2571,6 +2569,11 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
| // Don't ignore case. Nice simple case where the mask-compare will
|
| // determine definitely whether we have a match at this character
|
| // position.
|
| + if (c > char_mask) {
|
| + details->set_cannot_match();
|
| + pos->determines_perfectly = false;
|
| + return;
|
| + }
|
| pos->mask = char_mask;
|
| pos->value = c;
|
| pos->determines_perfectly = true;
|
|
|