src/jsregexp.cc - Issue 11759008: Introduce ENABLE_LATIN_1 compile flag

Unified Diff: src/jsregexp.cc

Issue 11759008: Introduce ENABLE_LATIN_1 compile flag (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Fix FilterASCII Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index 813208c9590e651749321a818351488629da4f1f..347fc03e7b26d96751bc0e270296bf828a0a41e3 100644

--- a/src/jsregexp.cc

+++ b/src/jsregexp.cc

@@ -1681,7 +1681,7 @@ static int GetCaseIndependentLetters(Isolate* isolate,

letters[0] = character;

length = 1;

}

- if (!ascii_subject || character <= String::kMaxAsciiCharCode) {

+ if (!ascii_subject || character <= String::kMaxOneByteCharCode) {

return length;

}

// The standard requires that non-ASCII characters cannot have ASCII

@@ -1732,7 +1732,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate,

bool checked = false;

// We handle the length > 1 case in a later pass.

if (length == 1) {

- if (ascii && c > String::kMaxAsciiCharCodeU) {

+ if (ascii && c > String::kMaxOneByteCharCodeU) {

// Can't match - see above.

return false; // Bounds not checked.

}

@@ -1753,7 +1753,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

Label* on_failure) {

uc16 char_mask;

if (ascii) {

- char_mask = String::kMaxAsciiCharCode;

+ char_mask = String::kMaxOneByteCharCode;

} else {

char_mask = String::kMaxUtf16CodeUnit;

}

@@ -2007,7 +2007,7 @@ static void SplitSearchSpace(ZoneList<int>* ranges,

// range with a single not-taken branch, speeding up this important

// character range (even non-ASCII charset-based text has spaces and

// punctuation).

- if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.

+ if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.

end_index - start_index > (*new_start_index - start_index) * 2 &&

last - first > kSize * 2 &&

binary_chop_index > *new_start_index &&

@@ -2211,7 +2211,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

int max_char;

if (ascii) {

- max_char = String::kMaxAsciiCharCode;

+ max_char = String::kMaxOneByteCharCode;

} else {

max_char = String::kMaxUtf16CodeUnit;

}

@@ -2513,7 +2513,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {

bool found_useful_op = false;

uint32_t char_mask;

if (asc) {

- char_mask = String::kMaxAsciiCharCode;

+ char_mask = String::kMaxOneByteCharCode;

} else {

char_mask = String::kMaxUtf16CodeUnit;

}

@@ -2522,7 +2522,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {

int char_shift = 0;

for (int i = 0; i < characters_; i++) {

Position* pos = &positions_[i];

- if ((pos->mask & String::kMaxAsciiCharCode) != 0) {

+ if ((pos->mask & String::kMaxOneByteCharCode) != 0) {

found_useful_op = true;

}

mask_ |= (pos->mask & char_mask) << char_shift;

@@ -2565,7 +2565,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,

// load so the value is already masked down.

uint32_t char_mask;

if (compiler->ascii()) {

- char_mask = String::kMaxAsciiCharCode;

+ char_mask = String::kMaxOneByteCharCode;

} else {

char_mask = String::kMaxUtf16CodeUnit;

}

@@ -2575,7 +2575,11 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,

// For 2-character preloads in ASCII mode or 1-character preloads in

// TWO_BYTE mode we also use a 16 bit load with zero extend.

if (details->characters() == 2 && compiler->ascii()) {

- if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;

+#ifndef ENABLE_LATIN_1

+ if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;

+#else

+ if ((mask & 0xffff) == 0xffff) need_mask = false;

+#endif

} else if (details->characters() == 1 && !compiler->ascii()) {

if ((mask & 0xffff) == 0xffff) need_mask = false;

} else {

@@ -2617,7 +2621,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

int characters = details->characters();

int char_mask;

if (compiler->ascii()) {

- char_mask = String::kMaxAsciiCharCode;

+ char_mask = String::kMaxOneByteCharCode;

} else {

char_mask = String::kMaxUtf16CodeUnit;

}

@@ -2834,24 +2838,24 @@ class VisitMarker {

};

-RegExpNode* SeqRegExpNode::FilterASCII(int depth) {

+RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {

if (info()->replacement_calculated) return replacement();

if (depth < 0) return this;

ASSERT(!info()->visited);

VisitMarker marker(info());

- return FilterSuccessor(depth - 1);

+ return FilterSuccessor(depth - 1, ignore_case);

}

-RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {

- RegExpNode* next = on_success_->FilterASCII(depth - 1);

+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {

+ RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);

if (next == NULL) return set_replacement(NULL);

on_success_ = next;

return set_replacement(this);

}

-RegExpNode* TextNode::FilterASCII(int depth) {

+RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {

if (info()->replacement_calculated) return replacement();

if (depth < 0) return this;

ASSERT(!info()->visited);

@@ -2862,15 +2866,40 @@ RegExpNode* TextNode::FilterASCII(int depth) {

if (elm.type == TextElement::ATOM) {

Vector<const uc16> quarks = elm.data.u_atom->data();

for (int j = 0; j < quarks.length(); j++) {

- // We don't need special handling for case independence

- // because of the rule that case independence cannot make

- // a non-ASCII character match an ASCII character.

- if (quarks[j] > String::kMaxAsciiCharCode) {

+#ifndef ENABLE_LATIN_1

+ if (quarks[j] > String::kMaxOneByteCharCode) {

return set_replacement(NULL);

}

+#else

+ if (quarks[j] <= String::kMaxOneByteCharCode) continue;

+ if (!ignore_case) return set_replacement(NULL);

+ // Here, we need to check for characters whose upper and lower cases

+ // are outside the Latin-1 range.

+ // TODO(dcarney): Replace this code with a simple

+ // table lookup in unibrow::Latin-1.

+ // TODO(dcarney): Add test cases.

+ unibrow::uchar result;

+ int chars;

+ chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);

+ if (chars > 1 ||

+ (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {

+ continue;

+ }

+ chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);

+ if (chars > 1 ||

+ (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {

+ continue;

+ }

+ // This character is definitely not in the Latin-1 range.

+ return set_replacement(NULL);

+#endif

}

} else {

ASSERT(elm.type == TextElement::CHAR_CLASS);

+#ifdef ENABLE_LATIN_1

+ // TODO(dcarney): Can this be improved?

+ if (ignore_case) continue;

+#endif

RegExpCharacterClass* cc = elm.data.u_char_class;

ZoneList<CharacterRange>* ranges = cc->ranges(zone());

if (!CharacterRange::IsCanonical(ranges)) {

@@ -2881,39 +2910,40 @@ RegExpNode* TextNode::FilterASCII(int depth) {

if (cc->is_negated()) {

if (range_count != 0 &&

ranges->at(0).from() == 0 &&

- ranges->at(0).to() >= String::kMaxAsciiCharCode) {

+ ranges->at(0).to() >= String::kMaxOneByteCharCode) {

return set_replacement(NULL);

}

} else {

if (range_count == 0 ||

- ranges->at(0).from() > String::kMaxAsciiCharCode) {

+ ranges->at(0).from() > String::kMaxOneByteCharCode) {

return set_replacement(NULL);

}

- return FilterSuccessor(depth - 1);

+ return FilterSuccessor(depth - 1, ignore_case);

}

-RegExpNode* LoopChoiceNode::FilterASCII(int depth) {

+RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {

if (info()->replacement_calculated) return replacement();

if (depth < 0) return this;

if (info()->visited) return this;

{

VisitMarker marker(info());

- RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);

+ RegExpNode* continue_replacement =

+ continue_node_->FilterASCII(depth - 1, ignore_case);

// If we can't continue after the loop then there is no sense in doing the

// loop.

if (continue_replacement == NULL) return set_replacement(NULL);

}

- return ChoiceNode::FilterASCII(depth - 1);

+ return ChoiceNode::FilterASCII(depth - 1, ignore_case);

}

-RegExpNode* ChoiceNode::FilterASCII(int depth) {

+RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {

if (info()->replacement_calculated) return replacement();

if (depth < 0) return this;

if (info()->visited) return this;

@@ -2932,7 +2962,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {

RegExpNode* survivor = NULL;

for (int i = 0; i < choice_count; i++) {

GuardedAlternative alternative = alternatives_->at(i);

- RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);

+ RegExpNode* replacement =

+ alternative.node()->FilterASCII(depth - 1, ignore_case);

ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.

if (replacement != NULL) {

alternatives_->at(i).set_node(replacement);

@@ -2952,7 +2983,7 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {

new(zone()) ZoneList<GuardedAlternative>(surviving, zone());

for (int i = 0; i < choice_count; i++) {

RegExpNode* replacement =

- alternatives_->at(i).node()->FilterASCII(depth - 1);

+ alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);

if (replacement != NULL) {

alternatives_->at(i).set_node(replacement);

new_alternatives->Add(alternatives_->at(i), zone());

@@ -2963,7 +2994,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {

}

-RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {

+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,

+ bool ignore_case) {

if (info()->replacement_calculated) return replacement();

if (depth < 0) return this;

if (info()->visited) return this;

@@ -2971,12 +3003,12 @@ RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {

// Alternative 0 is the negative lookahead, alternative 1 is what comes

// afterwards.

RegExpNode* node = alternatives_->at(1).node();

- RegExpNode* replacement = node->FilterASCII(depth - 1);

+ RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);

if (replacement == NULL) return set_replacement(NULL);

alternatives_->at(1).set_node(replacement);

RegExpNode* neg_node = alternatives_->at(0).node();

- RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);

+ RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);

// If the negative lookahead is always going to fail then

// we don't need to check it.

if (neg_replacement == NULL) return set_replacement(replacement);

@@ -3299,7 +3331,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,

switch (pass) {

case NON_ASCII_MATCH:

ASSERT(ascii);

- if (quarks[j] > String::kMaxAsciiCharCode) {

+ if (quarks[j] > String::kMaxOneByteCharCode) {

assembler->GoTo(backtrack);

return;

}

@@ -3498,7 +3530,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(

if (ranges->length() != 1) return NULL;

uint32_t max_char;

if (compiler->ascii()) {

- max_char = String::kMaxAsciiCharCode;

+ max_char = String::kMaxOneByteCharCode;

} else {

max_char = String::kMaxUtf16CodeUnit;

}

@@ -3698,7 +3730,7 @@ BoyerMooreLookahead::BoyerMooreLookahead(

: length_(length),

compiler_(compiler) {

if (compiler->ascii()) {

- max_char_ = String::kMaxAsciiCharCode;

+ max_char_ = String::kMaxOneByteCharCode;

} else {

max_char_ = String::kMaxUtf16CodeUnit;

}

@@ -5337,8 +5369,8 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

uc16 bottom = from();

uc16 top = to();

if (is_ascii) {

- if (bottom > String::kMaxAsciiCharCode) return;

- if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;

+ if (bottom > String::kMaxOneByteCharCode) return;

+ if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;

}

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

if (top == bottom) {

@@ -5885,7 +5917,7 @@ void TextNode::FillInBMInfo(int initial_offset,

int length = GetCaseIndependentLetters(

ISOLATE,

character,

- bm->max_char() == String::kMaxAsciiCharCode,

+ bm->max_char() == String::kMaxOneByteCharCode,

chars);

for (int j = 0; j < length; j++) {

bm->Set(offset, chars[j]);

@@ -6099,10 +6131,12 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(

}

if (is_ascii) {

- node = node->FilterASCII(RegExpCompiler::kMaxRecursion);

+ node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);

// Do it again to propagate the new nodes to places where they were not

// put because they had not been calculated yet.

- if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);

+ if (node != NULL) {

+ node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);

+ }

}

if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);

« no previous file with comments | « src/jsregexp.h ('k') | src/log.cc » ('j') | no next file with comments »