| Index: src/jsregexp.cc
|
| diff --git a/src/jsregexp.cc b/src/jsregexp.cc
|
| index 813208c9590e651749321a818351488629da4f1f..347fc03e7b26d96751bc0e270296bf828a0a41e3 100644
|
| --- a/src/jsregexp.cc
|
| +++ b/src/jsregexp.cc
|
| @@ -1681,7 +1681,7 @@ static int GetCaseIndependentLetters(Isolate* isolate,
|
| letters[0] = character;
|
| length = 1;
|
| }
|
| - if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
|
| + if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
|
| return length;
|
| }
|
| // The standard requires that non-ASCII characters cannot have ASCII
|
| @@ -1732,7 +1732,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate,
|
| bool checked = false;
|
| // We handle the length > 1 case in a later pass.
|
| if (length == 1) {
|
| - if (ascii && c > String::kMaxAsciiCharCodeU) {
|
| + if (ascii && c > String::kMaxOneByteCharCodeU) {
|
| // Can't match - see above.
|
| return false; // Bounds not checked.
|
| }
|
| @@ -1753,7 +1753,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
|
| Label* on_failure) {
|
| uc16 char_mask;
|
| if (ascii) {
|
| - char_mask = String::kMaxAsciiCharCode;
|
| + char_mask = String::kMaxOneByteCharCode;
|
| } else {
|
| char_mask = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -2007,7 +2007,7 @@ static void SplitSearchSpace(ZoneList<int>* ranges,
|
| // range with a single not-taken branch, speeding up this important
|
| // character range (even non-ASCII charset-based text has spaces and
|
| // punctuation).
|
| - if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.
|
| + if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.
|
| end_index - start_index > (*new_start_index - start_index) * 2 &&
|
| last - first > kSize * 2 &&
|
| binary_chop_index > *new_start_index &&
|
| @@ -2211,7 +2211,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
|
|
| int max_char;
|
| if (ascii) {
|
| - max_char = String::kMaxAsciiCharCode;
|
| + max_char = String::kMaxOneByteCharCode;
|
| } else {
|
| max_char = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -2513,7 +2513,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
|
| bool found_useful_op = false;
|
| uint32_t char_mask;
|
| if (asc) {
|
| - char_mask = String::kMaxAsciiCharCode;
|
| + char_mask = String::kMaxOneByteCharCode;
|
| } else {
|
| char_mask = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -2522,7 +2522,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
|
| int char_shift = 0;
|
| for (int i = 0; i < characters_; i++) {
|
| Position* pos = &positions_[i];
|
| - if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
|
| + if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
|
| found_useful_op = true;
|
| }
|
| mask_ |= (pos->mask & char_mask) << char_shift;
|
| @@ -2565,7 +2565,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
|
| // load so the value is already masked down.
|
| uint32_t char_mask;
|
| if (compiler->ascii()) {
|
| - char_mask = String::kMaxAsciiCharCode;
|
| + char_mask = String::kMaxOneByteCharCode;
|
| } else {
|
| char_mask = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -2575,7 +2575,11 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
|
| // For 2-character preloads in ASCII mode or 1-character preloads in
|
| // TWO_BYTE mode we also use a 16 bit load with zero extend.
|
| if (details->characters() == 2 && compiler->ascii()) {
|
| - if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
|
| +#ifndef ENABLE_LATIN_1
|
| + if ((mask & 0x7f7f) == 0xffff) need_mask = false;
|
| +#else
|
| + if ((mask & 0xffff) == 0xffff) need_mask = false;
|
| +#endif
|
| } else if (details->characters() == 1 && !compiler->ascii()) {
|
| if ((mask & 0xffff) == 0xffff) need_mask = false;
|
| } else {
|
| @@ -2617,7 +2621,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
| int characters = details->characters();
|
| int char_mask;
|
| if (compiler->ascii()) {
|
| - char_mask = String::kMaxAsciiCharCode;
|
| + char_mask = String::kMaxOneByteCharCode;
|
| } else {
|
| char_mask = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -2834,24 +2838,24 @@ class VisitMarker {
|
| };
|
|
|
|
|
| -RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
|
| +RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
|
| if (info()->replacement_calculated) return replacement();
|
| if (depth < 0) return this;
|
| ASSERT(!info()->visited);
|
| VisitMarker marker(info());
|
| - return FilterSuccessor(depth - 1);
|
| + return FilterSuccessor(depth - 1, ignore_case);
|
| }
|
|
|
|
|
| -RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
|
| - RegExpNode* next = on_success_->FilterASCII(depth - 1);
|
| +RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
|
| + RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
|
| if (next == NULL) return set_replacement(NULL);
|
| on_success_ = next;
|
| return set_replacement(this);
|
| }
|
|
|
|
|
| -RegExpNode* TextNode::FilterASCII(int depth) {
|
| +RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
| if (info()->replacement_calculated) return replacement();
|
| if (depth < 0) return this;
|
| ASSERT(!info()->visited);
|
| @@ -2862,15 +2866,40 @@ RegExpNode* TextNode::FilterASCII(int depth) {
|
| if (elm.type == TextElement::ATOM) {
|
| Vector<const uc16> quarks = elm.data.u_atom->data();
|
| for (int j = 0; j < quarks.length(); j++) {
|
| - // We don't need special handling for case independence
|
| - // because of the rule that case independence cannot make
|
| - // a non-ASCII character match an ASCII character.
|
| - if (quarks[j] > String::kMaxAsciiCharCode) {
|
| +#ifndef ENABLE_LATIN_1
|
| + if (quarks[j] > String::kMaxOneByteCharCode) {
|
| return set_replacement(NULL);
|
| }
|
| +#else
|
| + if (quarks[j] <= String::kMaxOneByteCharCode) continue;
|
| + if (!ignore_case) return set_replacement(NULL);
|
| + // Here, we need to check for characters whose upper and lower cases
|
| + // are outside the Latin-1 range.
|
| + // TODO(dcarney): Replace this code with a simple
|
| + // table lookup in unibrow::Latin-1.
|
| + // TODO(dcarney): Test cases!.
|
| + unibrow::uchar result;
|
| + int chars;
|
| + chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
|
| + if (chars > 1 ||
|
| + (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
|
| + continue;
|
| + }
|
| + chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
|
| + if (chars > 1 ||
|
| + (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
|
| + continue;
|
| + }
|
| + // This character is definitely not in the Latin-1 range.
|
| + return set_replacement(NULL);
|
| +#endif
|
| }
|
| } else {
|
| ASSERT(elm.type == TextElement::CHAR_CLASS);
|
| +#ifdef ENABLE_LATIN_1
|
| + // TODO(dcarney): Can this be improved?
|
| + if (ignore_case) continue;
|
| +#endif
|
| RegExpCharacterClass* cc = elm.data.u_char_class;
|
| ZoneList<CharacterRange>* ranges = cc->ranges(zone());
|
| if (!CharacterRange::IsCanonical(ranges)) {
|
| @@ -2881,39 +2910,40 @@ RegExpNode* TextNode::FilterASCII(int depth) {
|
| if (cc->is_negated()) {
|
| if (range_count != 0 &&
|
| ranges->at(0).from() == 0 &&
|
| - ranges->at(0).to() >= String::kMaxAsciiCharCode) {
|
| + ranges->at(0).to() >= String::kMaxOneByteCharCode) {
|
| return set_replacement(NULL);
|
| }
|
| } else {
|
| if (range_count == 0 ||
|
| - ranges->at(0).from() > String::kMaxAsciiCharCode) {
|
| + ranges->at(0).from() > String::kMaxOneByteCharCode) {
|
| return set_replacement(NULL);
|
| }
|
| }
|
| }
|
| }
|
| - return FilterSuccessor(depth - 1);
|
| + return FilterSuccessor(depth - 1, ignore_case);
|
| }
|
|
|
|
|
| -RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
|
| +RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
|
| if (info()->replacement_calculated) return replacement();
|
| if (depth < 0) return this;
|
| if (info()->visited) return this;
|
| {
|
| VisitMarker marker(info());
|
|
|
| - RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
|
| + RegExpNode* continue_replacement =
|
| + continue_node_->FilterASCII(depth - 1, ignore_case);
|
| // If we can't continue after the loop then there is no sense in doing the
|
| // loop.
|
| if (continue_replacement == NULL) return set_replacement(NULL);
|
| }
|
|
|
| - return ChoiceNode::FilterASCII(depth - 1);
|
| + return ChoiceNode::FilterASCII(depth - 1, ignore_case);
|
| }
|
|
|
|
|
| -RegExpNode* ChoiceNode::FilterASCII(int depth) {
|
| +RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
|
| if (info()->replacement_calculated) return replacement();
|
| if (depth < 0) return this;
|
| if (info()->visited) return this;
|
| @@ -2932,7 +2962,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {
|
| RegExpNode* survivor = NULL;
|
| for (int i = 0; i < choice_count; i++) {
|
| GuardedAlternative alternative = alternatives_->at(i);
|
| - RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
|
| + RegExpNode* replacement =
|
| + alternative.node()->FilterASCII(depth - 1, ignore_case);
|
| ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
|
| if (replacement != NULL) {
|
| alternatives_->at(i).set_node(replacement);
|
| @@ -2952,7 +2983,7 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {
|
| new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
|
| for (int i = 0; i < choice_count; i++) {
|
| RegExpNode* replacement =
|
| - alternatives_->at(i).node()->FilterASCII(depth - 1);
|
| + alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
|
| if (replacement != NULL) {
|
| alternatives_->at(i).set_node(replacement);
|
| new_alternatives->Add(alternatives_->at(i), zone());
|
| @@ -2963,7 +2994,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {
|
| }
|
|
|
|
|
| -RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
|
| +RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
|
| + bool ignore_case) {
|
| if (info()->replacement_calculated) return replacement();
|
| if (depth < 0) return this;
|
| if (info()->visited) return this;
|
| @@ -2971,12 +3003,12 @@ RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
|
| // Alternative 0 is the negative lookahead, alternative 1 is what comes
|
| // afterwards.
|
| RegExpNode* node = alternatives_->at(1).node();
|
| - RegExpNode* replacement = node->FilterASCII(depth - 1);
|
| + RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
|
| if (replacement == NULL) return set_replacement(NULL);
|
| alternatives_->at(1).set_node(replacement);
|
|
|
| RegExpNode* neg_node = alternatives_->at(0).node();
|
| - RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
|
| + RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
|
| // If the negative lookahead is always going to fail then
|
| // we don't need to check it.
|
| if (neg_replacement == NULL) return set_replacement(replacement);
|
| @@ -3299,7 +3331,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
|
| switch (pass) {
|
| case NON_ASCII_MATCH:
|
| ASSERT(ascii);
|
| - if (quarks[j] > String::kMaxAsciiCharCode) {
|
| + if (quarks[j] > String::kMaxOneByteCharCode) {
|
| assembler->GoTo(backtrack);
|
| return;
|
| }
|
| @@ -3498,7 +3530,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
|
| if (ranges->length() != 1) return NULL;
|
| uint32_t max_char;
|
| if (compiler->ascii()) {
|
| - max_char = String::kMaxAsciiCharCode;
|
| + max_char = String::kMaxOneByteCharCode;
|
| } else {
|
| max_char = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -3698,7 +3730,7 @@ BoyerMooreLookahead::BoyerMooreLookahead(
|
| : length_(length),
|
| compiler_(compiler) {
|
| if (compiler->ascii()) {
|
| - max_char_ = String::kMaxAsciiCharCode;
|
| + max_char_ = String::kMaxOneByteCharCode;
|
| } else {
|
| max_char_ = String::kMaxUtf16CodeUnit;
|
| }
|
| @@ -5337,8 +5369,8 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
| uc16 bottom = from();
|
| uc16 top = to();
|
| if (is_ascii) {
|
| - if (bottom > String::kMaxAsciiCharCode) return;
|
| - if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
|
| + if (bottom > String::kMaxOneByteCharCode) return;
|
| + if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
|
| }
|
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
|
| if (top == bottom) {
|
| @@ -5885,7 +5917,7 @@ void TextNode::FillInBMInfo(int initial_offset,
|
| int length = GetCaseIndependentLetters(
|
| ISOLATE,
|
| character,
|
| - bm->max_char() == String::kMaxAsciiCharCode,
|
| + bm->max_char() == String::kMaxOneByteCharCode,
|
| chars);
|
| for (int j = 0; j < length; j++) {
|
| bm->Set(offset, chars[j]);
|
| @@ -6099,10 +6131,12 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
|
| }
|
| }
|
| if (is_ascii) {
|
| - node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
|
| + node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
|
| // Do it again to propagate the new nodes to places where they were not
|
| // put because they had not been calculated yet.
|
| - if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
|
| + if (node != NULL) {
|
| + node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
|
| + }
|
| }
|
|
|
| if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
|
|
|