Index: src/jsregexp.cc |
diff --git a/src/jsregexp.cc b/src/jsregexp.cc |
index 04d194419fd852eed9673c72c322bcec3ed0f0da..83aa4a516633d11ddcf9a93cd770eeaa6ddd3c94 100644 |
--- a/src/jsregexp.cc |
+++ b/src/jsregexp.cc |
@@ -1431,14 +1431,6 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
int cp_offset, |
bool check_offset, |
bool preloaded) { |
- if (cc->is_standard() && |
- macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
- cp_offset, |
- check_offset, |
- on_failure)) { |
- return; |
- } |
- |
ZoneList<CharacterRange>* ranges = cc->ranges(); |
int max_char; |
if (ascii) { |
@@ -1489,6 +1481,12 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); |
} |
+ if (cc->is_standard() && |
+ macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
+ on_failure)) { |
+ return; |
+ } |
+ |
for (int i = 0; i < last_valid_range; i++) { |
CharacterRange& range = ranges->at(i); |
Label next_range; |
@@ -1626,8 +1624,8 @@ int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) { |
} |
-int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find, |
- int recursion_depth) { |
+int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, |
+ int recursion_depth) { |
if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
// Alternative 0 is the negative lookahead, alternative 1 is what comes |
// afterwards. |
@@ -2049,6 +2047,12 @@ static void EmitWordCheck(RegExpMacroAssembler* assembler, |
Label* word, |
Label* non_word, |
bool fall_through_on_word) { |
+ if (assembler->CheckSpecialCharacterClass( |
+ fall_through_on_word ? 'w' : 'W', |
+ fall_through_on_word ? non_word : word)) { |
+ // Optimized implementation available. |
+ return; |
+ } |
assembler->CheckCharacterGT('z', non_word); |
assembler->CheckCharacterLT('0', non_word); |
assembler->CheckCharacterGT('a' - 1, word); |
@@ -2085,17 +2089,60 @@ static void EmitHat(RegExpCompiler* compiler, |
assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, |
new_trace.backtrack(), |
false); |
- // Newline means \n, \r, 0x2028 or 0x2029. |
- if (!compiler->ascii()) { |
- assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
+ if (!assembler->CheckSpecialCharacterClass('n', |
+ new_trace.backtrack())) { |
+ // Newline means \n, \r, 0x2028 or 0x2029. |
+ if (!compiler->ascii()) { |
+ assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
+ } |
+ assembler->CheckCharacter('\n', &ok); |
+ assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
} |
- assembler->CheckCharacter('\n', &ok); |
- assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
assembler->Bind(&ok); |
on_success->Emit(compiler, &new_trace); |
} |
+// Emit the code to handle \b and \B (word-boundary or non-word-boundary) |
+// when we know whether the next character must be a word character or not. |
+static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, |
+ RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ Trace* trace) { |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
+ Label done; |
+ |
+ Trace new_trace(*trace); |
+ |
+ bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); |
+ Label* on_word = expect_word_character ? &done : new_trace.backtrack(); |
+ Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; |
+ |
+ // Check whether previous character was a word character. |
+ switch(trace->at_start()) { |
+ case Trace::TRUE: |
+ if (expect_word_character) { |
+ assembler->GoTo(on_non_word); |
+ } |
+ break; |
+ case Trace::UNKNOWN: |
+ ASSERT_EQ(0, trace->cp_offset()); |
+ assembler->CheckAtStart(on_non_word); |
+ // Fall through. |
+ case Trace::FALSE: |
+ int prev_char_offset = trace->cp_offset() - 1; |
+ assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); |
+ EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); |
+ // We may or may not have loaded the previous character. |
+ new_trace.InvalidateCurrentCharacter(); |
+ } |
+ |
+ assembler->Bind(&done); |
+ |
+ on_success->Emit(compiler, &new_trace); |
+} |
+ |
+ |
// Emit the code to handle \b and \B (word-boundary or non-word-boundary). |
static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, |
RegExpCompiler* compiler, |
@@ -2205,10 +2252,15 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
case AFTER_NEWLINE: |
EmitHat(compiler, on_success(), trace); |
return; |
- case AT_NON_BOUNDARY: |
case AT_BOUNDARY: |
+ case AT_NON_BOUNDARY: { |
EmitBoundaryCheck(type_, compiler, on_success(), trace); |
return; |
+ } |
+ case AFTER_WORD_CHARACTER: |
+ case AFTER_NONWORD_CHARACTER: { |
+ EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); |
+ } |
} |
on_success()->Emit(compiler, trace); |
} |
@@ -2791,7 +2843,7 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
// to generate probably can't use it. |
if (i != first_normal_choice) { |
alt_gen->expects_preload = false; |
- new_trace.set_characters_preloaded(0); |
+ new_trace.InvalidateCurrentCharacter(); |
} |
if (i < choice_count - 1) { |
new_trace.set_backtrack(&alt_gen->after); |
@@ -3282,6 +3334,12 @@ void DotPrinter::VisitAssertion(AssertionNode* that) { |
case AssertionNode::AFTER_NEWLINE: |
stream()->Add("label=\"(?<=\\n)\", shape=septagon"); |
break; |
+ case AssertionNode::AFTER_WORD_CHARACTER: |
+ stream()->Add("label=\"(?<=\\w)\", shape=septagon"); |
+ break; |
+ case AssertionNode::AFTER_NONWORD_CHARACTER: |
+ stream()->Add("label=\"(?<=\\W)\", shape=septagon"); |
+ break; |
} |
stream()->Add("];\n"); |
PrintAttributes(that); |
@@ -3484,6 +3542,20 @@ bool RegExpCharacterClass::is_standard() { |
set_.set_standard_set_type('.'); |
return true; |
} |
+ if (CompareRanges(set_.ranges(), |
+ kLineTerminatorRanges, |
+ kLineTerminatorRangeCount)) { |
+ set_.set_standard_set_type('n'); |
+ return true; |
+ } |
+ if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { |
+ set_.set_standard_set_type('w'); |
+ return true; |
+ } |
+ if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { |
+ set_.set_standard_set_type('W'); |
+ return true; |
+ } |
return false; |
} |
@@ -4010,6 +4082,101 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
} |
+bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { |
+ ASSERT_NOT_NULL(ranges); |
+ int n = ranges->length(); |
+ if (n <= 1) return true; |
+ int max = ranges->at(0).to(); |
+ for (int i = 1; i < n; i++) { |
+ CharacterRange next_range = ranges->at(i); |
+ if (next_range.from() <= max + 1) return false; |
+ max = next_range.to(); |
+ } |
+ return true; |
+} |
+ |
+SetRelation CharacterRange::WordCharacterRelation( |
+ ZoneList<CharacterRange>* range) { |
+ ASSERT(IsCanonical(range)); |
+ int i = 0; // Word character range index. |
+ int j = 0; // Argument range index. |
+ ASSERT_NE(0, kWordRangeCount); |
+ SetRelation result; |
+ if (range->length() == 0) { |
+ result.SetElementsInSecondSet(); |
+ return result; |
+ } |
+ CharacterRange argument_range = range->at(0); |
+ CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); |
+ while (i < kWordRangeCount && j < range->length()) { |
+ // Check the two ranges for the five cases: |
+ // - no overlap. |
+ // - partial overlap (there are elements in both ranges that isn't |
+ // in the other, and there are also elements that are in both). |
+ // - argument range entirely inside word range. |
+ // - word range entirely inside argument range. |
+ // - ranges are completely equal. |
+ |
+ // First check for no overlap. The earlier range is not in the other set. |
+ if (argument_range.from() > word_range.to()) { |
+ // Ranges are disjoint. The earlier word range contains elements that |
+ // cannot be in the argument set. |
+ result.SetElementsInSecondSet(); |
+ } else if (word_range.from() > argument_range.to()) { |
+ // Ranges are disjoint. The earlier argument range contains elements that |
+ // cannot be in the word set. |
+ result.SetElementsInFirstSet(); |
+ } else if (word_range.from() <= argument_range.from() && |
+ word_range.to() >= argument_range.from()) { |
+ result.SetElementsInBothSets(); |
+ // argument range completely inside word range. |
+ if (word_range.from() < argument_range.from() || |
+ word_range.to() > argument_range.from()) { |
+ result.SetElementsInSecondSet(); |
+ } |
+ } else if (word_range.from() >= argument_range.from() && |
+ word_range.to() <= argument_range.from()) { |
+ result.SetElementsInBothSets(); |
+ result.SetElementsInFirstSet(); |
+ } else { |
+ // There is overlap, and neither is a subrange of the other |
+ result.SetElementsInFirstSet(); |
+ result.SetElementsInSecondSet(); |
+ result.SetElementsInBothSets(); |
+ } |
+ if (result.NonTrivialIntersection()) { |
+ // The result is as (im)precise as we can possibly make it. |
+ return result; |
+ } |
+ // Progress the range(s) with minimal to-character. |
+ uc16 word_to = word_range.to(); |
+ uc16 argument_to = argument_range.to(); |
+ if (argument_to <= word_to) { |
+ j++; |
+ if (j < range->length()) { |
+ argument_range = range->at(j); |
+ } |
+ } |
+ if (word_to <= argument_to) { |
+ i += 2; |
+ if (i < kWordRangeCount) { |
+ word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); |
+ } |
+ } |
+ } |
+ // Check if anything wasn't compared in the loop. |
+ if (i < kWordRangeCount) { |
+ // word range contains something not in argument range. |
+ result.SetElementsInSecondSet(); |
+ } else if (j < range->length()){ |
+ // Argument range contains something not in word range. |
+ result.SetElementsInFirstSet(); |
+ } |
+ |
+ return result; |
+} |
+ |
+ |
static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
int bottom, |
int top) { |
@@ -4119,6 +4286,287 @@ ZoneList<CharacterRange>* CharacterSet::ranges() { |
} |
+// Move a number of elements in a zonelist to another position |
+// in the same list. Handles overlapping source and target areas. |
+static void MoveRanges(ZoneList<CharacterRange>* list, |
+ int from, |
+ int to, |
+ int count) { |
+ // Ranges are potentially overlapping. |
+ if (from < to) { |
+ for (int i = count - 1; i >= 0; i--) { |
+ list->at(to + i) = list->at(from + i); |
+ } |
+ } else { |
+ for (int i = 0; i < count; i++) { |
+ list->at(to + i) = list->at(from + i); |
+ } |
+ } |
+} |
+ |
+ |
+static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, |
+ int count, |
+ CharacterRange insert) { |
+ // Inserts a range into list[0..count[, which must be sorted |
+ // by from value and non-overlapping and non-adjacent, using at most |
+ // list[0..count] for the result. Returns the number of resulting |
+ // canonicalized ranges. Inserting a range may collapse existing ranges into |
+ // fewer ranges, so the return value can be anything in the range 1..count+1. |
+ uc16 from = insert.from(); |
+ uc16 to = insert.to(); |
+ int start_pos = 0; |
+ int end_pos = count; |
+ for (int i = count - 1; i >= 0; i--) { |
+ CharacterRange current = list->at(i); |
+ if (current.from() > to + 1) { |
+ end_pos = i; |
+ } else if (current.to() + 1 < from) { |
+ start_pos = i + 1; |
+ break; |
+ } |
+ } |
+ |
+ // Inserted range overlaps, or is adjacent to, ranges at positions |
+ // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are |
+ // not affected by the insertion. |
+ // If start_pos == end_pos, the range must be inserted before start_pos. |
+ // if start_pos < end_pos, the entire range from start_pos to end_pos |
+ // must be merged with the insert range. |
+ |
+ if (start_pos == end_pos) { |
+ // Insert between existing ranges at position start_pos. |
+ if (start_pos < count) { |
+ MoveRanges(list, start_pos, start_pos + 1, count - start_pos); |
+ } |
+ list->at(start_pos) = insert; |
+ return count + 1; |
+ } |
+ if (start_pos + 1 == end_pos) { |
+ // Replace single existing range at position start_pos. |
+ CharacterRange to_replace = list->at(start_pos); |
+ int new_from = Min(to_replace.from(), from); |
+ int new_to = Max(to_replace.to(), to); |
+ list->at(start_pos) = CharacterRange(new_from, new_to); |
+ return count; |
+ } |
+ // Replace a number of existing ranges from start_pos to end_pos - 1. |
+ // Move the remaining ranges down. |
+ |
+ int new_from = Min(list->at(start_pos).from(), from); |
+ int new_to = Max(list->at(end_pos - 1).to(), to); |
+ if (end_pos < count) { |
+ MoveRanges(list, end_pos, start_pos + 1, count - end_pos); |
+ } |
+ list->at(start_pos) = CharacterRange(new_from, new_to); |
+ return count - (end_pos - start_pos) + 1; |
+} |
+ |
+ |
+void CharacterSet::Canonicalize() { |
+ // Special/default classes are always considered canonical. The result |
+ // of calling ranges() will be sorted. |
+ if (ranges_ == NULL) return; |
+ CharacterRange::Canonicalize(ranges_); |
+} |
+ |
+ |
+void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { |
+ if (character_ranges->length() <= 1) return; |
+ // Check whether ranges are already canonical (increasing, non-overlapping, |
+ // non-adjacent). |
+ int n = character_ranges->length(); |
+ int max = character_ranges->at(0).to(); |
+ int i = 1; |
+ while (i < n) { |
+ CharacterRange current = character_ranges->at(i); |
+ if (current.from() <= max + 1) { |
+ break; |
+ } |
+ max = current.to(); |
+ i++; |
+ } |
+ // Canonical until the i'th range. If that's all of them, we are done. |
+ if (i == n) return; |
+ |
+ // The ranges at index i and forward are not canonicalized. Make them so by |
+ // doing the equivalent of insertion sort (inserting each into the previous |
+ // list, in order). |
+ // Notice that inserting a range can reduce the number of ranges in the |
+ // result due to combining of adjacent and overlapping ranges. |
+ int read = i; // Range to insert. |
+ int num_canonical = i; // Length of canonicalized part of list. |
+ do { |
+ num_canonical = InsertRangeInCanonicalList(character_ranges, |
+ num_canonical, |
+ character_ranges->at(read)); |
+ read++; |
+ } while (read < n); |
+ character_ranges->Rewind(num_canonical); |
+ |
+ ASSERT(CharacterRange::IsCanonical(character_ranges)); |
+} |
+ |
+ |
+// Utility function for CharacterRange::Merge. Adds a range at the end of |
+// a canonicalized range list, if necessary merging the range with the last |
+// range of the list. |
+static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { |
+ if (set == NULL) return; |
+ ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); |
+ int n = set->length(); |
+ if (n > 0) { |
+ CharacterRange lastRange = set->at(n - 1); |
+ if (lastRange.to() == range.from() - 1) { |
+ set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); |
+ return; |
+ } |
+ } |
+ set->Add(range); |
+} |
+ |
+ |
+static void AddRangeToSelectedSet(int selector, |
+ ZoneList<CharacterRange>* first_set, |
+ ZoneList<CharacterRange>* second_set, |
+ ZoneList<CharacterRange>* intersection_set, |
+ CharacterRange range) { |
+ switch (selector) { |
+ case kInsideFirst: |
+ AddRangeToSet(first_set, range); |
+ break; |
+ case kInsideSecond: |
+ AddRangeToSet(second_set, range); |
+ break; |
+ case kInsideBoth: |
+ AddRangeToSet(intersection_set, range); |
+ break; |
+ } |
+} |
+ |
+ |
+ |
+void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, |
+ ZoneList<CharacterRange>* second_set, |
+ ZoneList<CharacterRange>* first_set_only_out, |
+ ZoneList<CharacterRange>* second_set_only_out, |
+ ZoneList<CharacterRange>* both_sets_out) { |
+ // Inputs are canonicalized. |
+ ASSERT(CharacterRange::IsCanonical(first_set)); |
+ ASSERT(CharacterRange::IsCanonical(second_set)); |
+ // Outputs are empty, if applicable. |
+ ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); |
+ ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); |
+ ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); |
+ |
+ // Merge sets by iterating through the lists in order of lowest "from" value, |
+ // and putting intervals into one of three sets. |
+ |
+ if (first_set->length() == 0) { |
+ second_set_only_out->AddAll(*second_set); |
+ return; |
+ } |
+ if (second_set->length() == 0) { |
+ first_set_only_out->AddAll(*first_set); |
+ return; |
+ } |
+ // Indices into input lists. |
+ int i1 = 0; |
+ int i2 = 0; |
+ // Cache length of input lists. |
+ int n1 = first_set->length(); |
+ int n2 = second_set->length(); |
+ // Current range. May be invalid if state is kInsideNone. |
+ int from = 0; |
+ int to = -1; |
+ // Where current range comes from. |
+ int state = kInsideNone; |
+ |
+ while (i1 < n1 || i2 < n2) { |
+ CharacterRange next_range; |
+ int range_source; |
+ if (i2 == n2 || first_set->at(i1).from() < second_set->at(i2).from()) { |
+ next_range = first_set->at(i1++); |
+ range_source = kInsideFirst; |
+ } else { |
+ next_range = second_set->at(i2++); |
+ range_source = kInsideSecond; |
+ } |
+ if (to < next_range.from()) { |
+ // Ranges disjoint: |current| |next| |
+ AddRangeToSelectedSet(state, |
+ first_set_only_out, |
+ second_set_only_out, |
+ both_sets_out, |
+ CharacterRange(from, to)); |
+ from = next_range.from(); |
+ to = next_range.to(); |
+ state = range_source; |
+ } else { |
+ if (from < next_range.from()) { |
+ AddRangeToSelectedSet(state, |
+ first_set_only_out, |
+ second_set_only_out, |
+ both_sets_out, |
+ CharacterRange(from, next_range.from()-1)); |
+ } |
+ if (to < next_range.to()) { |
+ // Ranges overlap: |current| |
+ // |next| |
+ AddRangeToSelectedSet(state | range_source, |
+ first_set_only_out, |
+ second_set_only_out, |
+ both_sets_out, |
+ CharacterRange(next_range.from(), to)); |
+ from = to + 1; |
+ to = next_range.to(); |
+ state = range_source; |
+ } else { |
+ // Range included: |current| , possibly ending at same character. |
+ // |next| |
+ AddRangeToSelectedSet( |
+ state | range_source, |
+ first_set_only_out, |
+ second_set_only_out, |
+ both_sets_out, |
+ CharacterRange(next_range.from(), next_range.to())); |
+ from = next_range.to() + 1; |
+ // If ranges end at same character, both ranges are consumed completely. |
+ if (next_range.to() == to) state = kInsideNone; |
+ } |
+ } |
+ } |
+ AddRangeToSelectedSet(state, |
+ first_set_only_out, |
+ second_set_only_out, |
+ both_sets_out, |
+ CharacterRange(from, to)); |
+} |
+ |
+ |
+void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, |
+ ZoneList<CharacterRange>* negated_ranges) { |
+ ASSERT(CharacterRange::IsCanonical(ranges)); |
+ ASSERT_EQ(0, negated_ranges->length()); |
+ int range_count = ranges->length(); |
+ uc16 from = 0; |
+ int i = 0; |
+ if (range_count > 0 && ranges->at(0).from() == 0) { |
+ from = ranges->at(0).to(); |
+ i = 1; |
+ } |
+ while (i < range_count) { |
+ CharacterRange range = ranges->at(i); |
+ negated_ranges->Add(CharacterRange(from + 1, range.from() - 1)); |
+ from = range.to(); |
+ i++; |
+ } |
+ if (from < String::kMaxUC16CharCode) { |
+ negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode)); |
+ } |
+} |
+ |
+ |
// ------------------------------------------------------------------- |
// Interest propagation |
@@ -4410,9 +4858,203 @@ void Analysis::VisitBackReference(BackReferenceNode* that) { |
void Analysis::VisitAssertion(AssertionNode* that) { |
EnsureAnalyzed(that->on_success()); |
+ AssertionNode::AssertionNodeType type = that->type(); |
+ if (type == AssertionNode::AT_BOUNDARY || |
+ type == AssertionNode::AT_NON_BOUNDARY) { |
+ // Check if the following character is known to be a word character |
+ // or known to not be a word character. |
+ ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); |
+ |
+ CharacterRange::Canonicalize(following_chars); |
+ |
+ SetRelation word_relation = |
+ CharacterRange::WordCharacterRelation(following_chars); |
+ if (word_relation.ContainedIn()) { |
+ // Following character is definitely a word character. |
+ type = (type == AssertionNode::AT_BOUNDARY) ? |
+ AssertionNode::AFTER_NONWORD_CHARACTER : |
+ AssertionNode::AFTER_WORD_CHARACTER; |
+ that->set_type(type); |
+ } else if (word_relation.Disjoint()) { |
+ // Following character is definitely *not* a word character. |
+ type = (type == AssertionNode::AT_BOUNDARY) ? |
+ AssertionNode::AFTER_WORD_CHARACTER : |
+ AssertionNode::AFTER_NONWORD_CHARACTER; |
+ that->set_type(type); |
+ } |
+ } |
} |
+ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { |
+ if (first_character_set_ == NULL) { |
+ if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { |
+ // If we can't find an exact solution within the budget, we |
+ // set the value to the set of every character, i.e., all characters |
+ // are possible. |
+ ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); |
+ all_set->Add(CharacterRange::Everything()); |
+ first_character_set_ = all_set; |
+ } |
+ } |
+ return first_character_set_; |
+} |
+ |
+ |
+int RegExpNode::ComputeFirstCharacterSet(int budget) { |
+ // Default behavior is to not be able to determine the first character. |
+ return kComputeFirstCharacterSetFail; |
+} |
+ |
+ |
+int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { |
+ budget--; |
+ if (budget >= 0) { |
+ // Find loop min-iteration. It's the value of the guarded choice node |
+ // with a GEQ guard, if any. |
+ int min_repetition = 0; |
+ |
+ for (int i = 0; i <= 1; i++) { |
+ GuardedAlternative alternative = alternatives()->at(i); |
+ ZoneList<Guard*>* guards = alternative.guards(); |
+ if (guards != NULL && guards->length() > 0) { |
+ Guard* guard = guards->at(0); |
+ if (guard->op() == Guard::GEQ) { |
+ min_repetition = guard->value(); |
+ break; |
+ } |
+ } |
+ } |
+ |
+ budget = loop_node()->ComputeFirstCharacterSet(budget); |
+ if (budget >= 0) { |
+ ZoneList<CharacterRange>* character_set = |
+ loop_node()->first_character_set(); |
+ if (body_can_be_zero_length() || min_repetition == 0) { |
+ budget = continue_node()->ComputeFirstCharacterSet(budget); |
+ if (budget < 0) return budget; |
+ ZoneList<CharacterRange>* body_set = |
+ continue_node()->first_character_set(); |
+ ZoneList<CharacterRange>* union_set = |
+ new ZoneList<CharacterRange>(Max(character_set->length(), |
+ body_set->length())); |
+ CharacterRange::Merge(character_set, |
+ body_set, |
+ union_set, |
+ union_set, |
+ union_set); |
+ character_set = union_set; |
+ } |
+ set_first_character_set(character_set); |
+ } |
+ } |
+ return budget; |
+} |
+ |
+ |
+int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { |
+ budget--; |
+ if (budget >= 0) { |
+ GuardedAlternative successor = this->alternatives()->at(1); |
+ RegExpNode* successor_node = successor.node(); |
+ budget = successor_node->ComputeFirstCharacterSet(budget); |
+ if (budget >= 0) { |
+ set_first_character_set(successor_node->first_character_set()); |
+ } |
+ } |
+ return budget; |
+} |
+ |
+ |
+// The first character set of an EndNode is unknowable. Just use the |
+// default implementation that fails and returns all characters as possible. |
+ |
+ |
+int AssertionNode::ComputeFirstCharacterSet(int budget) { |
+ budget -= 1; |
+ if (budget >= 0) { |
+ switch (type_) { |
+ case AT_END: { |
+ set_first_character_set(new ZoneList<CharacterRange>(0)); |
+ break; |
+ } |
+ case AT_START: |
+ case AT_BOUNDARY: |
+ case AT_NON_BOUNDARY: |
+ case AFTER_NEWLINE: |
+ case AFTER_NONWORD_CHARACTER: |
+ case AFTER_WORD_CHARACTER: { |
+ ASSERT_NOT_NULL(on_success()); |
+ budget = on_success()->ComputeFirstCharacterSet(budget); |
+ set_first_character_set(on_success()->first_character_set()); |
+ break; |
+ } |
+ } |
+ } |
+ return budget; |
+} |
+ |
+ |
+int ActionNode::ComputeFirstCharacterSet(int budget) { |
+ if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; |
+ budget--; |
+ if (budget >= 0) { |
+ ASSERT_NOT_NULL(on_success()); |
+ budget = on_success()->ComputeFirstCharacterSet(budget); |
+ if (budget >= 0) { |
+ set_first_character_set(on_success()->first_character_set()); |
+ } |
+ } |
+ return budget; |
+} |
+ |
+ |
+int BackReferenceNode::ComputeFirstCharacterSet(int budget) { |
+ // We don't know anything about the first character of a backreference |
+ // at this point. |
+ return kComputeFirstCharacterSetFail; |
+} |
+ |
+ |
+int TextNode::ComputeFirstCharacterSet(int budget) { |
+ budget--; |
+ if (budget >= 0) { |
+ ASSERT_NE(0, elements()->length()); |
+ TextElement text = elements()->at(0); |
+ if (text.type == TextElement::ATOM) { |
+ RegExpAtom* atom = text.data.u_atom; |
+ ASSERT_NE(0, atom->length()); |
+ uc16 first_char = atom->data()[0]; |
+ ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); |
+ range->Add(CharacterRange(first_char, first_char)); |
+ set_first_character_set(range); |
+ } else { |
+ ASSERT(text.type == TextElement::CHAR_CLASS); |
+ RegExpCharacterClass* char_class = text.data.u_char_class; |
+ if (char_class->is_negated()) { |
+ ZoneList<CharacterRange>* ranges = char_class->ranges(); |
+ int length = ranges->length(); |
+ int new_length = length + 1; |
+ if (length > 0) { |
+ if (ranges->at(0).from() == 0) new_length--; |
+ if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) { |
+ new_length--; |
+ } |
+ } |
+ ZoneList<CharacterRange>* negated_ranges = |
+ new ZoneList<CharacterRange>(new_length); |
+ CharacterRange::Negate(ranges, negated_ranges); |
+ set_first_character_set(negated_ranges); |
+ } else { |
+ set_first_character_set(char_class->ranges()); |
+ } |
+ } |
+ } |
+ return budget; |
+} |
+ |
+ |
+ |
// ------------------------------------------------------------------- |
// Dispatch table construction |
@@ -4471,7 +5113,6 @@ void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { |
} |
- |
static int CompareRangeByFrom(const CharacterRange* a, |
const CharacterRange* b) { |
return Compare<uc16>(a->from(), b->from()); |