| Index: src/jsregexp.cc
|
| diff --git a/src/jsregexp.cc b/src/jsregexp.cc
|
| index 04d194419fd852eed9673c72c322bcec3ed0f0da..83aa4a516633d11ddcf9a93cd770eeaa6ddd3c94 100644
|
| --- a/src/jsregexp.cc
|
| +++ b/src/jsregexp.cc
|
| @@ -1431,14 +1431,6 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
| int cp_offset,
|
| bool check_offset,
|
| bool preloaded) {
|
| - if (cc->is_standard() &&
|
| - macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
|
| - cp_offset,
|
| - check_offset,
|
| - on_failure)) {
|
| - return;
|
| - }
|
| -
|
| ZoneList<CharacterRange>* ranges = cc->ranges();
|
| int max_char;
|
| if (ascii) {
|
| @@ -1489,6 +1481,12 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
| macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
|
| }
|
|
|
| + if (cc->is_standard() &&
|
| + macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
|
| + on_failure)) {
|
| + return;
|
| + }
|
| +
|
| for (int i = 0; i < last_valid_range; i++) {
|
| CharacterRange& range = ranges->at(i);
|
| Label next_range;
|
| @@ -1626,8 +1624,8 @@ int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
|
| }
|
|
|
|
|
| -int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
|
| - int recursion_depth) {
|
| +int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
|
| + int recursion_depth) {
|
| if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
|
| // Alternative 0 is the negative lookahead, alternative 1 is what comes
|
| // afterwards.
|
| @@ -2049,6 +2047,12 @@ static void EmitWordCheck(RegExpMacroAssembler* assembler,
|
| Label* word,
|
| Label* non_word,
|
| bool fall_through_on_word) {
|
| + if (assembler->CheckSpecialCharacterClass(
|
| + fall_through_on_word ? 'w' : 'W',
|
| + fall_through_on_word ? non_word : word)) {
|
| + // Optimized implementation available.
|
| + return;
|
| + }
|
| assembler->CheckCharacterGT('z', non_word);
|
| assembler->CheckCharacterLT('0', non_word);
|
| assembler->CheckCharacterGT('a' - 1, word);
|
| @@ -2085,17 +2089,60 @@ static void EmitHat(RegExpCompiler* compiler,
|
| assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
|
| new_trace.backtrack(),
|
| false);
|
| - // Newline means \n, \r, 0x2028 or 0x2029.
|
| - if (!compiler->ascii()) {
|
| - assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
|
| + if (!assembler->CheckSpecialCharacterClass('n',
|
| + new_trace.backtrack())) {
|
| + // Newline means \n, \r, 0x2028 or 0x2029.
|
| + if (!compiler->ascii()) {
|
| + assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
|
| + }
|
| + assembler->CheckCharacter('\n', &ok);
|
| + assembler->CheckNotCharacter('\r', new_trace.backtrack());
|
| }
|
| - assembler->CheckCharacter('\n', &ok);
|
| - assembler->CheckNotCharacter('\r', new_trace.backtrack());
|
| assembler->Bind(&ok);
|
| on_success->Emit(compiler, &new_trace);
|
| }
|
|
|
|
|
| +// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
|
| +// when we know whether the next character must be a word character or not.
|
| +static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
|
| + RegExpCompiler* compiler,
|
| + RegExpNode* on_success,
|
| + Trace* trace) {
|
| + RegExpMacroAssembler* assembler = compiler->macro_assembler();
|
| + Label done;
|
| +
|
| + Trace new_trace(*trace);
|
| +
|
| + bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
|
| + Label* on_word = expect_word_character ? &done : new_trace.backtrack();
|
| + Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
|
| +
|
| + // Check whether previous character was a word character.
|
| + switch(trace->at_start()) {
|
| + case Trace::TRUE:
|
| + if (expect_word_character) {
|
| + assembler->GoTo(on_non_word);
|
| + }
|
| + break;
|
| + case Trace::UNKNOWN:
|
| + ASSERT_EQ(0, trace->cp_offset());
|
| + assembler->CheckAtStart(on_non_word);
|
| + // Fall through.
|
| + case Trace::FALSE:
|
| + int prev_char_offset = trace->cp_offset() - 1;
|
| + assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
|
| + EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
|
| + // We may or may not have loaded the previous character.
|
| + new_trace.InvalidateCurrentCharacter();
|
| + }
|
| +
|
| + assembler->Bind(&done);
|
| +
|
| + on_success->Emit(compiler, &new_trace);
|
| +}
|
| +
|
| +
|
| // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
|
| static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
|
| RegExpCompiler* compiler,
|
| @@ -2205,10 +2252,15 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
|
| case AFTER_NEWLINE:
|
| EmitHat(compiler, on_success(), trace);
|
| return;
|
| - case AT_NON_BOUNDARY:
|
| case AT_BOUNDARY:
|
| + case AT_NON_BOUNDARY: {
|
| EmitBoundaryCheck(type_, compiler, on_success(), trace);
|
| return;
|
| + }
|
| + case AFTER_WORD_CHARACTER:
|
| + case AFTER_NONWORD_CHARACTER: {
|
| + EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
|
| + }
|
| }
|
| on_success()->Emit(compiler, trace);
|
| }
|
| @@ -2791,7 +2843,7 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
|
| // to generate probably can't use it.
|
| if (i != first_normal_choice) {
|
| alt_gen->expects_preload = false;
|
| - new_trace.set_characters_preloaded(0);
|
| + new_trace.InvalidateCurrentCharacter();
|
| }
|
| if (i < choice_count - 1) {
|
| new_trace.set_backtrack(&alt_gen->after);
|
| @@ -3282,6 +3334,12 @@ void DotPrinter::VisitAssertion(AssertionNode* that) {
|
| case AssertionNode::AFTER_NEWLINE:
|
| stream()->Add("label=\"(?<=\\n)\", shape=septagon");
|
| break;
|
| + case AssertionNode::AFTER_WORD_CHARACTER:
|
| + stream()->Add("label=\"(?<=\\w)\", shape=septagon");
|
| + break;
|
| + case AssertionNode::AFTER_NONWORD_CHARACTER:
|
| + stream()->Add("label=\"(?<=\\W)\", shape=septagon");
|
| + break;
|
| }
|
| stream()->Add("];\n");
|
| PrintAttributes(that);
|
| @@ -3484,6 +3542,20 @@ bool RegExpCharacterClass::is_standard() {
|
| set_.set_standard_set_type('.');
|
| return true;
|
| }
|
| + if (CompareRanges(set_.ranges(),
|
| + kLineTerminatorRanges,
|
| + kLineTerminatorRangeCount)) {
|
| + set_.set_standard_set_type('n');
|
| + return true;
|
| + }
|
| + if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
|
| + set_.set_standard_set_type('w');
|
| + return true;
|
| + }
|
| + if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
|
| + set_.set_standard_set_type('W');
|
| + return true;
|
| + }
|
| return false;
|
| }
|
|
|
| @@ -4010,6 +4082,101 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
| }
|
|
|
|
|
| +bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
|
| + ASSERT_NOT_NULL(ranges);
|
| + int n = ranges->length();
|
| + if (n <= 1) return true;
|
| + int max = ranges->at(0).to();
|
| + for (int i = 1; i < n; i++) {
|
| + CharacterRange next_range = ranges->at(i);
|
| + if (next_range.from() <= max + 1) return false;
|
| + max = next_range.to();
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +SetRelation CharacterRange::WordCharacterRelation(
|
| + ZoneList<CharacterRange>* range) {
|
| + ASSERT(IsCanonical(range));
|
| + int i = 0; // Word character range index.
|
| + int j = 0; // Argument range index.
|
| + ASSERT_NE(0, kWordRangeCount);
|
| + SetRelation result;
|
| + if (range->length() == 0) {
|
| + result.SetElementsInSecondSet();
|
| + return result;
|
| + }
|
| + CharacterRange argument_range = range->at(0);
|
| + CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
|
| + while (i < kWordRangeCount && j < range->length()) {
|
| + // Check the two ranges for the five cases:
|
| + // - no overlap.
|
| + // - partial overlap (there are elements in both ranges that isn't
|
| + // in the other, and there are also elements that are in both).
|
| + // - argument range entirely inside word range.
|
| + // - word range entirely inside argument range.
|
| + // - ranges are completely equal.
|
| +
|
| + // First check for no overlap. The earlier range is not in the other set.
|
| + if (argument_range.from() > word_range.to()) {
|
| + // Ranges are disjoint. The earlier word range contains elements that
|
| + // cannot be in the argument set.
|
| + result.SetElementsInSecondSet();
|
| + } else if (word_range.from() > argument_range.to()) {
|
| + // Ranges are disjoint. The earlier argument range contains elements that
|
| + // cannot be in the word set.
|
| + result.SetElementsInFirstSet();
|
| + } else if (word_range.from() <= argument_range.from() &&
|
| + word_range.to() >= argument_range.from()) {
|
| + result.SetElementsInBothSets();
|
| + // argument range completely inside word range.
|
| + if (word_range.from() < argument_range.from() ||
|
| + word_range.to() > argument_range.from()) {
|
| + result.SetElementsInSecondSet();
|
| + }
|
| + } else if (word_range.from() >= argument_range.from() &&
|
| + word_range.to() <= argument_range.from()) {
|
| + result.SetElementsInBothSets();
|
| + result.SetElementsInFirstSet();
|
| + } else {
|
| + // There is overlap, and neither is a subrange of the other
|
| + result.SetElementsInFirstSet();
|
| + result.SetElementsInSecondSet();
|
| + result.SetElementsInBothSets();
|
| + }
|
| + if (result.NonTrivialIntersection()) {
|
| + // The result is as (im)precise as we can possibly make it.
|
| + return result;
|
| + }
|
| + // Progress the range(s) with minimal to-character.
|
| + uc16 word_to = word_range.to();
|
| + uc16 argument_to = argument_range.to();
|
| + if (argument_to <= word_to) {
|
| + j++;
|
| + if (j < range->length()) {
|
| + argument_range = range->at(j);
|
| + }
|
| + }
|
| + if (word_to <= argument_to) {
|
| + i += 2;
|
| + if (i < kWordRangeCount) {
|
| + word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
|
| + }
|
| + }
|
| + }
|
| + // Check if anything wasn't compared in the loop.
|
| + if (i < kWordRangeCount) {
|
| + // word range contains something not in argument range.
|
| + result.SetElementsInSecondSet();
|
| + } else if (j < range->length()){
|
| + // Argument range contains something not in word range.
|
| + result.SetElementsInFirstSet();
|
| + }
|
| +
|
| + return result;
|
| +}
|
| +
|
| +
|
| static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
|
| int bottom,
|
| int top) {
|
| @@ -4119,6 +4286,287 @@ ZoneList<CharacterRange>* CharacterSet::ranges() {
|
| }
|
|
|
|
|
| +// Move a number of elements in a zonelist to another position
|
| +// in the same list. Handles overlapping source and target areas.
|
| +static void MoveRanges(ZoneList<CharacterRange>* list,
|
| + int from,
|
| + int to,
|
| + int count) {
|
| + // Ranges are potentially overlapping.
|
| + if (from < to) {
|
| + for (int i = count - 1; i >= 0; i--) {
|
| + list->at(to + i) = list->at(from + i);
|
| + }
|
| + } else {
|
| + for (int i = 0; i < count; i++) {
|
| + list->at(to + i) = list->at(from + i);
|
| + }
|
| + }
|
| +}
|
| +
|
| +
|
| +static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
|
| + int count,
|
| + CharacterRange insert) {
|
| + // Inserts a range into list[0..count[, which must be sorted
|
| + // by from value and non-overlapping and non-adjacent, using at most
|
| + // list[0..count] for the result. Returns the number of resulting
|
| + // canonicalized ranges. Inserting a range may collapse existing ranges into
|
| + // fewer ranges, so the return value can be anything in the range 1..count+1.
|
| + uc16 from = insert.from();
|
| + uc16 to = insert.to();
|
| + int start_pos = 0;
|
| + int end_pos = count;
|
| + for (int i = count - 1; i >= 0; i--) {
|
| + CharacterRange current = list->at(i);
|
| + if (current.from() > to + 1) {
|
| + end_pos = i;
|
| + } else if (current.to() + 1 < from) {
|
| + start_pos = i + 1;
|
| + break;
|
| + }
|
| + }
|
| +
|
| + // Inserted range overlaps, or is adjacent to, ranges at positions
|
| + // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
|
| + // not affected by the insertion.
|
| + // If start_pos == end_pos, the range must be inserted before start_pos.
|
| + // if start_pos < end_pos, the entire range from start_pos to end_pos
|
| + // must be merged with the insert range.
|
| +
|
| + if (start_pos == end_pos) {
|
| + // Insert between existing ranges at position start_pos.
|
| + if (start_pos < count) {
|
| + MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
|
| + }
|
| + list->at(start_pos) = insert;
|
| + return count + 1;
|
| + }
|
| + if (start_pos + 1 == end_pos) {
|
| + // Replace single existing range at position start_pos.
|
| + CharacterRange to_replace = list->at(start_pos);
|
| + int new_from = Min(to_replace.from(), from);
|
| + int new_to = Max(to_replace.to(), to);
|
| + list->at(start_pos) = CharacterRange(new_from, new_to);
|
| + return count;
|
| + }
|
| + // Replace a number of existing ranges from start_pos to end_pos - 1.
|
| + // Move the remaining ranges down.
|
| +
|
| + int new_from = Min(list->at(start_pos).from(), from);
|
| + int new_to = Max(list->at(end_pos - 1).to(), to);
|
| + if (end_pos < count) {
|
| + MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
|
| + }
|
| + list->at(start_pos) = CharacterRange(new_from, new_to);
|
| + return count - (end_pos - start_pos) + 1;
|
| +}
|
| +
|
| +
|
| +void CharacterSet::Canonicalize() {
|
| + // Special/default classes are always considered canonical. The result
|
| + // of calling ranges() will be sorted.
|
| + if (ranges_ == NULL) return;
|
| + CharacterRange::Canonicalize(ranges_);
|
| +}
|
| +
|
| +
|
| +void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
|
| + if (character_ranges->length() <= 1) return;
|
| + // Check whether ranges are already canonical (increasing, non-overlapping,
|
| + // non-adjacent).
|
| + int n = character_ranges->length();
|
| + int max = character_ranges->at(0).to();
|
| + int i = 1;
|
| + while (i < n) {
|
| + CharacterRange current = character_ranges->at(i);
|
| + if (current.from() <= max + 1) {
|
| + break;
|
| + }
|
| + max = current.to();
|
| + i++;
|
| + }
|
| + // Canonical until the i'th range. If that's all of them, we are done.
|
| + if (i == n) return;
|
| +
|
| + // The ranges at index i and forward are not canonicalized. Make them so by
|
| + // doing the equivalent of insertion sort (inserting each into the previous
|
| + // list, in order).
|
| + // Notice that inserting a range can reduce the number of ranges in the
|
| + // result due to combining of adjacent and overlapping ranges.
|
| + int read = i; // Range to insert.
|
| + int num_canonical = i; // Length of canonicalized part of list.
|
| + do {
|
| + num_canonical = InsertRangeInCanonicalList(character_ranges,
|
| + num_canonical,
|
| + character_ranges->at(read));
|
| + read++;
|
| + } while (read < n);
|
| + character_ranges->Rewind(num_canonical);
|
| +
|
| + ASSERT(CharacterRange::IsCanonical(character_ranges));
|
| +}
|
| +
|
| +
|
| +// Utility function for CharacterRange::Merge. Adds a range at the end of
|
| +// a canonicalized range list, if necessary merging the range with the last
|
| +// range of the list.
|
| +static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
|
| + if (set == NULL) return;
|
| + ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
|
| + int n = set->length();
|
| + if (n > 0) {
|
| + CharacterRange lastRange = set->at(n - 1);
|
| + if (lastRange.to() == range.from() - 1) {
|
| + set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
|
| + return;
|
| + }
|
| + }
|
| + set->Add(range);
|
| +}
|
| +
|
| +
|
| +static void AddRangeToSelectedSet(int selector,
|
| + ZoneList<CharacterRange>* first_set,
|
| + ZoneList<CharacterRange>* second_set,
|
| + ZoneList<CharacterRange>* intersection_set,
|
| + CharacterRange range) {
|
| + switch (selector) {
|
| + case kInsideFirst:
|
| + AddRangeToSet(first_set, range);
|
| + break;
|
| + case kInsideSecond:
|
| + AddRangeToSet(second_set, range);
|
| + break;
|
| + case kInsideBoth:
|
| + AddRangeToSet(intersection_set, range);
|
| + break;
|
| + }
|
| +}
|
| +
|
| +
|
| +
|
| +void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
|
| + ZoneList<CharacterRange>* second_set,
|
| + ZoneList<CharacterRange>* first_set_only_out,
|
| + ZoneList<CharacterRange>* second_set_only_out,
|
| + ZoneList<CharacterRange>* both_sets_out) {
|
| + // Inputs are canonicalized.
|
| + ASSERT(CharacterRange::IsCanonical(first_set));
|
| + ASSERT(CharacterRange::IsCanonical(second_set));
|
| + // Outputs are empty, if applicable.
|
| + ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
|
| + ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
|
| + ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
|
| +
|
| + // Merge sets by iterating through the lists in order of lowest "from" value,
|
| + // and putting intervals into one of three sets.
|
| +
|
| + if (first_set->length() == 0) {
|
| + second_set_only_out->AddAll(*second_set);
|
| + return;
|
| + }
|
| + if (second_set->length() == 0) {
|
| + first_set_only_out->AddAll(*first_set);
|
| + return;
|
| + }
|
| + // Indices into input lists.
|
| + int i1 = 0;
|
| + int i2 = 0;
|
| + // Cache length of input lists.
|
| + int n1 = first_set->length();
|
| + int n2 = second_set->length();
|
| + // Current range. May be invalid if state is kInsideNone.
|
| + int from = 0;
|
| + int to = -1;
|
| + // Where current range comes from.
|
| + int state = kInsideNone;
|
| +
|
| + while (i1 < n1 || i2 < n2) {
|
| + CharacterRange next_range;
|
| + int range_source;
|
| + if (i2 == n2 || first_set->at(i1).from() < second_set->at(i2).from()) {
|
| + next_range = first_set->at(i1++);
|
| + range_source = kInsideFirst;
|
| + } else {
|
| + next_range = second_set->at(i2++);
|
| + range_source = kInsideSecond;
|
| + }
|
| + if (to < next_range.from()) {
|
| + // Ranges disjoint: |current| |next|
|
| + AddRangeToSelectedSet(state,
|
| + first_set_only_out,
|
| + second_set_only_out,
|
| + both_sets_out,
|
| + CharacterRange(from, to));
|
| + from = next_range.from();
|
| + to = next_range.to();
|
| + state = range_source;
|
| + } else {
|
| + if (from < next_range.from()) {
|
| + AddRangeToSelectedSet(state,
|
| + first_set_only_out,
|
| + second_set_only_out,
|
| + both_sets_out,
|
| + CharacterRange(from, next_range.from()-1));
|
| + }
|
| + if (to < next_range.to()) {
|
| + // Ranges overlap: |current|
|
| + // |next|
|
| + AddRangeToSelectedSet(state | range_source,
|
| + first_set_only_out,
|
| + second_set_only_out,
|
| + both_sets_out,
|
| + CharacterRange(next_range.from(), to));
|
| + from = to + 1;
|
| + to = next_range.to();
|
| + state = range_source;
|
| + } else {
|
| + // Range included: |current| , possibly ending at same character.
|
| + // |next|
|
| + AddRangeToSelectedSet(
|
| + state | range_source,
|
| + first_set_only_out,
|
| + second_set_only_out,
|
| + both_sets_out,
|
| + CharacterRange(next_range.from(), next_range.to()));
|
| + from = next_range.to() + 1;
|
| + // If ranges end at same character, both ranges are consumed completely.
|
| + if (next_range.to() == to) state = kInsideNone;
|
| + }
|
| + }
|
| + }
|
| + AddRangeToSelectedSet(state,
|
| + first_set_only_out,
|
| + second_set_only_out,
|
| + both_sets_out,
|
| + CharacterRange(from, to));
|
| +}
|
| +
|
| +
|
| +void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
|
| + ZoneList<CharacterRange>* negated_ranges) {
|
| + ASSERT(CharacterRange::IsCanonical(ranges));
|
| + ASSERT_EQ(0, negated_ranges->length());
|
| + int range_count = ranges->length();
|
| + uc16 from = 0;
|
| + int i = 0;
|
| + if (range_count > 0 && ranges->at(0).from() == 0) {
|
| + from = ranges->at(0).to();
|
| + i = 1;
|
| + }
|
| + while (i < range_count) {
|
| + CharacterRange range = ranges->at(i);
|
| + negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
|
| + from = range.to();
|
| + i++;
|
| + }
|
| + if (from < String::kMaxUC16CharCode) {
|
| + negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
|
| + }
|
| +}
|
| +
|
| +
|
|
|
| // -------------------------------------------------------------------
|
| // Interest propagation
|
| @@ -4410,9 +4858,203 @@ void Analysis::VisitBackReference(BackReferenceNode* that) {
|
|
|
| void Analysis::VisitAssertion(AssertionNode* that) {
|
| EnsureAnalyzed(that->on_success());
|
| + AssertionNode::AssertionNodeType type = that->type();
|
| + if (type == AssertionNode::AT_BOUNDARY ||
|
| + type == AssertionNode::AT_NON_BOUNDARY) {
|
| + // Check if the following character is known to be a word character
|
| + // or known to not be a word character.
|
| + ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
|
| +
|
| + CharacterRange::Canonicalize(following_chars);
|
| +
|
| + SetRelation word_relation =
|
| + CharacterRange::WordCharacterRelation(following_chars);
|
| + if (word_relation.ContainedIn()) {
|
| + // Following character is definitely a word character.
|
| + type = (type == AssertionNode::AT_BOUNDARY) ?
|
| + AssertionNode::AFTER_NONWORD_CHARACTER :
|
| + AssertionNode::AFTER_WORD_CHARACTER;
|
| + that->set_type(type);
|
| + } else if (word_relation.Disjoint()) {
|
| + // Following character is definitely *not* a word character.
|
| + type = (type == AssertionNode::AT_BOUNDARY) ?
|
| + AssertionNode::AFTER_WORD_CHARACTER :
|
| + AssertionNode::AFTER_NONWORD_CHARACTER;
|
| + that->set_type(type);
|
| + }
|
| + }
|
| }
|
|
|
|
|
| +ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
|
| + if (first_character_set_ == NULL) {
|
| + if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
|
| + // If we can't find an exact solution within the budget, we
|
| + // set the value to the set of every character, i.e., all characters
|
| + // are possible.
|
| + ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
|
| + all_set->Add(CharacterRange::Everything());
|
| + first_character_set_ = all_set;
|
| + }
|
| + }
|
| + return first_character_set_;
|
| +}
|
| +
|
| +
|
| +int RegExpNode::ComputeFirstCharacterSet(int budget) {
|
| + // Default behavior is to not be able to determine the first character.
|
| + return kComputeFirstCharacterSetFail;
|
| +}
|
| +
|
| +
|
| +int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
|
| + budget--;
|
| + if (budget >= 0) {
|
| + // Find loop min-iteration. It's the value of the guarded choice node
|
| + // with a GEQ guard, if any.
|
| + int min_repetition = 0;
|
| +
|
| + for (int i = 0; i <= 1; i++) {
|
| + GuardedAlternative alternative = alternatives()->at(i);
|
| + ZoneList<Guard*>* guards = alternative.guards();
|
| + if (guards != NULL && guards->length() > 0) {
|
| + Guard* guard = guards->at(0);
|
| + if (guard->op() == Guard::GEQ) {
|
| + min_repetition = guard->value();
|
| + break;
|
| + }
|
| + }
|
| + }
|
| +
|
| + budget = loop_node()->ComputeFirstCharacterSet(budget);
|
| + if (budget >= 0) {
|
| + ZoneList<CharacterRange>* character_set =
|
| + loop_node()->first_character_set();
|
| + if (body_can_be_zero_length() || min_repetition == 0) {
|
| + budget = continue_node()->ComputeFirstCharacterSet(budget);
|
| + if (budget < 0) return budget;
|
| + ZoneList<CharacterRange>* body_set =
|
| + continue_node()->first_character_set();
|
| + ZoneList<CharacterRange>* union_set =
|
| + new ZoneList<CharacterRange>(Max(character_set->length(),
|
| + body_set->length()));
|
| + CharacterRange::Merge(character_set,
|
| + body_set,
|
| + union_set,
|
| + union_set,
|
| + union_set);
|
| + character_set = union_set;
|
| + }
|
| + set_first_character_set(character_set);
|
| + }
|
| + }
|
| + return budget;
|
| +}
|
| +
|
| +
|
| +int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
|
| + budget--;
|
| + if (budget >= 0) {
|
| + GuardedAlternative successor = this->alternatives()->at(1);
|
| + RegExpNode* successor_node = successor.node();
|
| + budget = successor_node->ComputeFirstCharacterSet(budget);
|
| + if (budget >= 0) {
|
| + set_first_character_set(successor_node->first_character_set());
|
| + }
|
| + }
|
| + return budget;
|
| +}
|
| +
|
| +
|
| +// The first character set of an EndNode is unknowable. Just use the
|
| +// default implementation that fails and returns all characters as possible.
|
| +
|
| +
|
| +int AssertionNode::ComputeFirstCharacterSet(int budget) {
|
| + budget -= 1;
|
| + if (budget >= 0) {
|
| + switch (type_) {
|
| + case AT_END: {
|
| + set_first_character_set(new ZoneList<CharacterRange>(0));
|
| + break;
|
| + }
|
| + case AT_START:
|
| + case AT_BOUNDARY:
|
| + case AT_NON_BOUNDARY:
|
| + case AFTER_NEWLINE:
|
| + case AFTER_NONWORD_CHARACTER:
|
| + case AFTER_WORD_CHARACTER: {
|
| + ASSERT_NOT_NULL(on_success());
|
| + budget = on_success()->ComputeFirstCharacterSet(budget);
|
| + set_first_character_set(on_success()->first_character_set());
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + return budget;
|
| +}
|
| +
|
| +
|
| +int ActionNode::ComputeFirstCharacterSet(int budget) {
|
| + if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
|
| + budget--;
|
| + if (budget >= 0) {
|
| + ASSERT_NOT_NULL(on_success());
|
| + budget = on_success()->ComputeFirstCharacterSet(budget);
|
| + if (budget >= 0) {
|
| + set_first_character_set(on_success()->first_character_set());
|
| + }
|
| + }
|
| + return budget;
|
| +}
|
| +
|
| +
|
| +int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
|
| + // We don't know anything about the first character of a backreference
|
| + // at this point.
|
| + return kComputeFirstCharacterSetFail;
|
| +}
|
| +
|
| +
|
| +int TextNode::ComputeFirstCharacterSet(int budget) {
|
| + budget--;
|
| + if (budget >= 0) {
|
| + ASSERT_NE(0, elements()->length());
|
| + TextElement text = elements()->at(0);
|
| + if (text.type == TextElement::ATOM) {
|
| + RegExpAtom* atom = text.data.u_atom;
|
| + ASSERT_NE(0, atom->length());
|
| + uc16 first_char = atom->data()[0];
|
| + ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
|
| + range->Add(CharacterRange(first_char, first_char));
|
| + set_first_character_set(range);
|
| + } else {
|
| + ASSERT(text.type == TextElement::CHAR_CLASS);
|
| + RegExpCharacterClass* char_class = text.data.u_char_class;
|
| + if (char_class->is_negated()) {
|
| + ZoneList<CharacterRange>* ranges = char_class->ranges();
|
| + int length = ranges->length();
|
| + int new_length = length + 1;
|
| + if (length > 0) {
|
| + if (ranges->at(0).from() == 0) new_length--;
|
| + if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
|
| + new_length--;
|
| + }
|
| + }
|
| + ZoneList<CharacterRange>* negated_ranges =
|
| + new ZoneList<CharacterRange>(new_length);
|
| + CharacterRange::Negate(ranges, negated_ranges);
|
| + set_first_character_set(negated_ranges);
|
| + } else {
|
| + set_first_character_set(char_class->ranges());
|
| + }
|
| + }
|
| + }
|
| + return budget;
|
| +}
|
| +
|
| +
|
| +
|
| // -------------------------------------------------------------------
|
| // Dispatch table construction
|
|
|
| @@ -4471,7 +5113,6 @@ void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
|
| }
|
|
|
|
|
| -
|
| static int CompareRangeByFrom(const CharacterRange* a,
|
| const CharacterRange* b) {
|
| return Compare<uc16>(a->from(), b->from());
|
|
|