Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(219)

Side by Side Diff: src/jsregexp.cc

Issue 507051: Attempt to make \b\w+ faster. Slight performance increase on, e.g., string unpacking. (Closed)
Patch Set: Addressed review comments. Created 10 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/parser.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1413 matching lines...) Expand 10 before | Expand all | Expand 10 after
1424 } 1424 }
1425 1425
1426 1426
1427 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1427 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1428 RegExpCharacterClass* cc, 1428 RegExpCharacterClass* cc,
1429 bool ascii, 1429 bool ascii,
1430 Label* on_failure, 1430 Label* on_failure,
1431 int cp_offset, 1431 int cp_offset,
1432 bool check_offset, 1432 bool check_offset,
1433 bool preloaded) { 1433 bool preloaded) {
1434 if (cc->is_standard() &&
1435 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1436 cp_offset,
1437 check_offset,
1438 on_failure)) {
1439 return;
1440 }
1441
1442 ZoneList<CharacterRange>* ranges = cc->ranges(); 1434 ZoneList<CharacterRange>* ranges = cc->ranges();
1443 int max_char; 1435 int max_char;
1444 if (ascii) { 1436 if (ascii) {
1445 max_char = String::kMaxAsciiCharCode; 1437 max_char = String::kMaxAsciiCharCode;
1446 } else { 1438 } else {
1447 max_char = String::kMaxUC16CharCode; 1439 max_char = String::kMaxUC16CharCode;
1448 } 1440 }
1449 1441
1450 Label success; 1442 Label success;
1451 1443
(...skipping 30 matching lines...) Expand all
1482 if (check_offset) { 1474 if (check_offset) {
1483 macro_assembler->CheckPosition(cp_offset, on_failure); 1475 macro_assembler->CheckPosition(cp_offset, on_failure);
1484 } 1476 }
1485 return; 1477 return;
1486 } 1478 }
1487 1479
1488 if (!preloaded) { 1480 if (!preloaded) {
1489 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1481 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1490 } 1482 }
1491 1483
1484 if (cc->is_standard() &&
1485 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1486 on_failure)) {
1487 return;
1488 }
1489
1492 for (int i = 0; i < last_valid_range; i++) { 1490 for (int i = 0; i < last_valid_range; i++) {
1493 CharacterRange& range = ranges->at(i); 1491 CharacterRange& range = ranges->at(i);
1494 Label next_range; 1492 Label next_range;
1495 uc16 from = range.from(); 1493 uc16 from = range.from();
1496 uc16 to = range.to(); 1494 uc16 to = range.to();
1497 if (from > max_char) { 1495 if (from > max_char) {
1498 continue; 1496 continue;
1499 } 1497 }
1500 if (to > max_char) to = max_char; 1498 if (to > max_char) to = max_char;
1501 if (to == from) { 1499 if (to == from) {
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after
1619 1617
1620 int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1618 int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1621 int answer = Length(); 1619 int answer = Length();
1622 if (answer >= still_to_find) return answer; 1620 if (answer >= still_to_find) return answer;
1623 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer; 1621 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1624 return answer + on_success()->EatsAtLeast(still_to_find - answer, 1622 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1625 recursion_depth + 1); 1623 recursion_depth + 1);
1626 } 1624 }
1627 1625
1628 1626
1629 int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find, 1627 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
1630 int recursion_depth) { 1628 int recursion_depth) {
1631 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1629 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1632 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1630 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1633 // afterwards. 1631 // afterwards.
1634 RegExpNode* node = alternatives_->at(1).node(); 1632 RegExpNode* node = alternatives_->at(1).node();
1635 return node->EatsAtLeast(still_to_find, recursion_depth + 1); 1633 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1636 } 1634 }
1637 1635
1638 1636
1639 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 1637 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1640 QuickCheckDetails* details, 1638 QuickCheckDetails* details,
(...skipping 401 matching lines...) Expand 10 before | Expand all | Expand 10 after
2042 details->Merge(&new_details, characters_filled_in); 2040 details->Merge(&new_details, characters_filled_in);
2043 } 2041 }
2044 } 2042 }
2045 2043
2046 2044
2047 // Check for [0-9A-Z_a-z]. 2045 // Check for [0-9A-Z_a-z].
2048 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2046 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2049 Label* word, 2047 Label* word,
2050 Label* non_word, 2048 Label* non_word,
2051 bool fall_through_on_word) { 2049 bool fall_through_on_word) {
2050 if (assembler->CheckSpecialCharacterClass(
2051 fall_through_on_word ? 'w' : 'W',
2052 fall_through_on_word ? non_word : word)) {
2053 // Optimized implementation available.
2054 return;
2055 }
2052 assembler->CheckCharacterGT('z', non_word); 2056 assembler->CheckCharacterGT('z', non_word);
2053 assembler->CheckCharacterLT('0', non_word); 2057 assembler->CheckCharacterLT('0', non_word);
2054 assembler->CheckCharacterGT('a' - 1, word); 2058 assembler->CheckCharacterGT('a' - 1, word);
2055 assembler->CheckCharacterLT('9' + 1, word); 2059 assembler->CheckCharacterLT('9' + 1, word);
2056 assembler->CheckCharacterLT('A', non_word); 2060 assembler->CheckCharacterLT('A', non_word);
2057 assembler->CheckCharacterLT('Z' + 1, word); 2061 assembler->CheckCharacterLT('Z' + 1, word);
2058 if (fall_through_on_word) { 2062 if (fall_through_on_word) {
2059 assembler->CheckNotCharacter('_', non_word); 2063 assembler->CheckNotCharacter('_', non_word);
2060 } else { 2064 } else {
2061 assembler->CheckCharacter('_', word); 2065 assembler->CheckCharacter('_', word);
(...skipping 16 matching lines...) Expand all
2078 if (new_trace.cp_offset() == 0) { 2082 if (new_trace.cp_offset() == 0) {
2079 // The start of input counts as a newline in this context, so skip to 2083 // The start of input counts as a newline in this context, so skip to
2080 // ok if we are at the start. 2084 // ok if we are at the start.
2081 assembler->CheckAtStart(&ok); 2085 assembler->CheckAtStart(&ok);
2082 } 2086 }
2083 // We already checked that we are not at the start of input so it must be 2087 // We already checked that we are not at the start of input so it must be
2084 // OK to load the previous character. 2088 // OK to load the previous character.
2085 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 2089 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2086 new_trace.backtrack(), 2090 new_trace.backtrack(),
2087 false); 2091 false);
2088 // Newline means \n, \r, 0x2028 or 0x2029. 2092 if (!assembler->CheckSpecialCharacterClass('n',
2089 if (!compiler->ascii()) { 2093 new_trace.backtrack())) {
2090 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2094 // Newline means \n, \r, 0x2028 or 0x2029.
2095 if (!compiler->ascii()) {
2096 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2097 }
2098 assembler->CheckCharacter('\n', &ok);
2099 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2091 } 2100 }
2092 assembler->CheckCharacter('\n', &ok);
2093 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2094 assembler->Bind(&ok); 2101 assembler->Bind(&ok);
2095 on_success->Emit(compiler, &new_trace); 2102 on_success->Emit(compiler, &new_trace);
2096 } 2103 }
2097 2104
2098 2105
2106 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
2107 // when we know whether the next character must be a word character or not.
2108 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
2109 RegExpCompiler* compiler,
2110 RegExpNode* on_success,
2111 Trace* trace) {
2112 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2113 Label done;
2114
2115 Trace new_trace(*trace);
2116
2117 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
2118 Label* on_word = expect_word_character ? &done : new_trace.backtrack();
2119 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
2120
2121 // Check whether previous character was a word character.
2122 switch(trace->at_start()) {
2123 case Trace::TRUE:
2124 if (expect_word_character) {
2125 assembler->GoTo(on_non_word);
2126 }
2127 break;
2128 case Trace::UNKNOWN:
2129 ASSERT_EQ(0, trace->cp_offset());
2130 assembler->CheckAtStart(on_non_word);
2131 // Fall through.
2132 case Trace::FALSE:
2133 int prev_char_offset = trace->cp_offset() - 1;
2134 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
2135 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
2136 // We may or may not have loaded the previous character.
2137 new_trace.InvalidateCurrentCharacter();
2138 }
2139
2140 assembler->Bind(&done);
2141
2142 on_success->Emit(compiler, &new_trace);
2143 }
2144
2145
2099 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2146 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2100 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2147 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2101 RegExpCompiler* compiler, 2148 RegExpCompiler* compiler,
2102 RegExpNode* on_success, 2149 RegExpNode* on_success,
2103 Trace* trace) { 2150 Trace* trace) {
2104 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2151 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2105 Label before_non_word; 2152 Label before_non_word;
2106 Label before_word; 2153 Label before_word;
2107 if (trace->characters_preloaded() != 1) { 2154 if (trace->characters_preloaded() != 1) {
2108 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2155 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
2198 Trace at_start_trace = *trace; 2245 Trace at_start_trace = *trace;
2199 at_start_trace.set_at_start(true); 2246 at_start_trace.set_at_start(true);
2200 on_success()->Emit(compiler, &at_start_trace); 2247 on_success()->Emit(compiler, &at_start_trace);
2201 return; 2248 return;
2202 } 2249 }
2203 } 2250 }
2204 break; 2251 break;
2205 case AFTER_NEWLINE: 2252 case AFTER_NEWLINE:
2206 EmitHat(compiler, on_success(), trace); 2253 EmitHat(compiler, on_success(), trace);
2207 return; 2254 return;
2208 case AT_NON_BOUNDARY:
2209 case AT_BOUNDARY: 2255 case AT_BOUNDARY:
2256 case AT_NON_BOUNDARY: {
2210 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2257 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2211 return; 2258 return;
2259 }
2260 case AFTER_WORD_CHARACTER:
2261 case AFTER_NONWORD_CHARACTER: {
2262 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
2263 }
2212 } 2264 }
2213 on_success()->Emit(compiler, trace); 2265 on_success()->Emit(compiler, trace);
2214 } 2266 }
2215 2267
2216 2268
2217 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2269 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2218 if (quick_check == NULL) return false; 2270 if (quick_check == NULL) return false;
2219 if (offset >= quick_check->characters()) return false; 2271 if (offset >= quick_check->characters()) return false;
2220 return quick_check->positions(offset)->determines_perfectly; 2272 return quick_check->positions(offset)->determines_perfectly;
2221 } 2273 }
(...skipping 562 matching lines...) Expand 10 before | Expand all | Expand 10 after
2784 } 2836 }
2785 continue; 2837 continue;
2786 } else { 2838 } else {
2787 // No quick check was generated. Put the full code here. 2839 // No quick check was generated. Put the full code here.
2788 // If this is not the first choice then there could be slow checks from 2840 // If this is not the first choice then there could be slow checks from
2789 // previous cases that go here when they fail. There's no reason to 2841 // previous cases that go here when they fail. There's no reason to
2790 // insist that they preload characters since the slow check we are about 2842 // insist that they preload characters since the slow check we are about
2791 // to generate probably can't use it. 2843 // to generate probably can't use it.
2792 if (i != first_normal_choice) { 2844 if (i != first_normal_choice) {
2793 alt_gen->expects_preload = false; 2845 alt_gen->expects_preload = false;
2794 new_trace.set_characters_preloaded(0); 2846 new_trace.InvalidateCurrentCharacter();
2795 } 2847 }
2796 if (i < choice_count - 1) { 2848 if (i < choice_count - 1) {
2797 new_trace.set_backtrack(&alt_gen->after); 2849 new_trace.set_backtrack(&alt_gen->after);
2798 } 2850 }
2799 generate_full_check_inline = true; 2851 generate_full_check_inline = true;
2800 } 2852 }
2801 if (generate_full_check_inline) { 2853 if (generate_full_check_inline) {
2802 if (new_trace.actions() != NULL) { 2854 if (new_trace.actions() != NULL) {
2803 new_trace.set_flush_budget(new_flush_budget); 2855 new_trace.set_flush_budget(new_flush_budget);
2804 } 2856 }
(...skipping 470 matching lines...) Expand 10 before | Expand all | Expand 10 after
3275 break; 3327 break;
3276 case AssertionNode::AT_BOUNDARY: 3328 case AssertionNode::AT_BOUNDARY:
3277 stream()->Add("label=\"\\b\", shape=septagon"); 3329 stream()->Add("label=\"\\b\", shape=septagon");
3278 break; 3330 break;
3279 case AssertionNode::AT_NON_BOUNDARY: 3331 case AssertionNode::AT_NON_BOUNDARY:
3280 stream()->Add("label=\"\\B\", shape=septagon"); 3332 stream()->Add("label=\"\\B\", shape=septagon");
3281 break; 3333 break;
3282 case AssertionNode::AFTER_NEWLINE: 3334 case AssertionNode::AFTER_NEWLINE:
3283 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 3335 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3284 break; 3336 break;
3337 case AssertionNode::AFTER_WORD_CHARACTER:
3338 stream()->Add("label=\"(?<=\\w)\", shape=septagon");
3339 break;
3340 case AssertionNode::AFTER_NONWORD_CHARACTER:
3341 stream()->Add("label=\"(?<=\\W)\", shape=septagon");
3342 break;
3285 } 3343 }
3286 stream()->Add("];\n"); 3344 stream()->Add("];\n");
3287 PrintAttributes(that); 3345 PrintAttributes(that);
3288 RegExpNode* successor = that->on_success(); 3346 RegExpNode* successor = that->on_success();
3289 stream()->Add(" n%p -> n%p;\n", that, successor); 3347 stream()->Add(" n%p -> n%p;\n", that, successor);
3290 Visit(successor); 3348 Visit(successor);
3291 } 3349 }
3292 3350
3293 3351
3294 void DotPrinter::VisitAction(ActionNode* that) { 3352 void DotPrinter::VisitAction(ActionNode* that) {
(...skipping 182 matching lines...) Expand 10 before | Expand all | Expand 10 after
3477 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3535 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3478 set_.set_standard_set_type('S'); 3536 set_.set_standard_set_type('S');
3479 return true; 3537 return true;
3480 } 3538 }
3481 if (CompareInverseRanges(set_.ranges(), 3539 if (CompareInverseRanges(set_.ranges(),
3482 kLineTerminatorRanges, 3540 kLineTerminatorRanges,
3483 kLineTerminatorRangeCount)) { 3541 kLineTerminatorRangeCount)) {
3484 set_.set_standard_set_type('.'); 3542 set_.set_standard_set_type('.');
3485 return true; 3543 return true;
3486 } 3544 }
3545 if (CompareRanges(set_.ranges(),
3546 kLineTerminatorRanges,
3547 kLineTerminatorRangeCount)) {
3548 set_.set_standard_set_type('n');
3549 return true;
3550 }
3551 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3552 set_.set_standard_set_type('w');
3553 return true;
3554 }
3555 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3556 set_.set_standard_set_type('W');
3557 return true;
3558 }
3487 return false; 3559 return false;
3488 } 3560 }
3489 3561
3490 3562
3491 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 3563 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3492 RegExpNode* on_success) { 3564 RegExpNode* on_success) {
3493 return new TextNode(this, on_success); 3565 return new TextNode(this, on_success);
3494 } 3566 }
3495 3567
3496 3568
(...skipping 506 matching lines...) Expand 10 before | Expand all | Expand 10 after
4003 start = pos = block_end + 1; 4075 start = pos = block_end + 1;
4004 } 4076 }
4005 } else { 4077 } else {
4006 // Unibrow ranges don't work for high characters due to the "2^11 bug". 4078 // Unibrow ranges don't work for high characters due to the "2^11 bug".
4007 // Therefore we do something dumber for these ranges. 4079 // Therefore we do something dumber for these ranges.
4008 AddUncanonicals(ranges, bottom, top); 4080 AddUncanonicals(ranges, bottom, top);
4009 } 4081 }
4010 } 4082 }
4011 4083
4012 4084
4085 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
4086 ASSERT_NOT_NULL(ranges);
4087 int n = ranges->length();
4088 if (n <= 1) return true;
4089 int max = ranges->at(0).to();
4090 for (int i = 1; i < n; i++) {
4091 CharacterRange next_range = ranges->at(i);
4092 if (next_range.from() <= max + 1) return false;
4093 max = next_range.to();
4094 }
4095 return true;
4096 }
4097
4098 SetRelation CharacterRange::WordCharacterRelation(
4099 ZoneList<CharacterRange>* range) {
4100 ASSERT(IsCanonical(range));
4101 int i = 0; // Word character range index.
4102 int j = 0; // Argument range index.
4103 ASSERT_NE(0, kWordRangeCount);
4104 SetRelation result;
4105 if (range->length() == 0) {
4106 result.SetElementsInSecondSet();
4107 return result;
4108 }
4109 CharacterRange argument_range = range->at(0);
4110 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
4111 while (i < kWordRangeCount && j < range->length()) {
4112 // Check the two ranges for the five cases:
4113 // - no overlap.
4114 // - partial overlap (there are elements in both ranges that isn't
4115 // in the other, and there are also elements that are in both).
4116 // - argument range entirely inside word range.
4117 // - word range entirely inside argument range.
4118 // - ranges are completely equal.
4119
4120 // First check for no overlap. The earlier range is not in the other set.
4121 if (argument_range.from() > word_range.to()) {
4122 // Ranges are disjoint. The earlier word range contains elements that
4123 // cannot be in the argument set.
4124 result.SetElementsInSecondSet();
4125 } else if (word_range.from() > argument_range.to()) {
4126 // Ranges are disjoint. The earlier argument range contains elements that
4127 // cannot be in the word set.
4128 result.SetElementsInFirstSet();
4129 } else if (word_range.from() <= argument_range.from() &&
4130 word_range.to() >= argument_range.from()) {
4131 result.SetElementsInBothSets();
4132 // argument range completely inside word range.
4133 if (word_range.from() < argument_range.from() ||
4134 word_range.to() > argument_range.from()) {
4135 result.SetElementsInSecondSet();
4136 }
4137 } else if (word_range.from() >= argument_range.from() &&
4138 word_range.to() <= argument_range.from()) {
4139 result.SetElementsInBothSets();
4140 result.SetElementsInFirstSet();
4141 } else {
4142 // There is overlap, and neither is a subrange of the other
4143 result.SetElementsInFirstSet();
4144 result.SetElementsInSecondSet();
4145 result.SetElementsInBothSets();
4146 }
4147 if (result.NonTrivialIntersection()) {
4148 // The result is as (im)precise as we can possibly make it.
4149 return result;
4150 }
4151 // Progress the range(s) with minimal to-character.
4152 uc16 word_to = word_range.to();
4153 uc16 argument_to = argument_range.to();
4154 if (argument_to <= word_to) {
4155 j++;
4156 if (j < range->length()) {
4157 argument_range = range->at(j);
4158 }
4159 }
4160 if (word_to <= argument_to) {
4161 i += 2;
4162 if (i < kWordRangeCount) {
4163 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
4164 }
4165 }
4166 }
4167 // Check if anything wasn't compared in the loop.
4168 if (i < kWordRangeCount) {
4169 // word range contains something not in argument range.
4170 result.SetElementsInSecondSet();
4171 } else if (j < range->length()){
4172 // Argument range contains something not in word range.
4173 result.SetElementsInFirstSet();
4174 }
4175
4176 return result;
4177 }
4178
4179
4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, 4180 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
4014 int bottom, 4181 int bottom,
4015 int top) { 4182 int top) {
4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4183 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4017 // Zones with no case mappings. There is a DEBUG-mode loop to assert that 4184 // Zones with no case mappings. There is a DEBUG-mode loop to assert that
4018 // this table is correct. 4185 // this table is correct.
4019 // 0x0600 - 0x0fff 4186 // 0x0600 - 0x0fff
4020 // 0x1100 - 0x1cff 4187 // 0x1100 - 0x1cff
4021 // 0x2000 - 0x20ff 4188 // 0x2000 - 0x20ff
4022 // 0x2200 - 0x23ff 4189 // 0x2200 - 0x23ff
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
4112 4279
4113 ZoneList<CharacterRange>* CharacterSet::ranges() { 4280 ZoneList<CharacterRange>* CharacterSet::ranges() {
4114 if (ranges_ == NULL) { 4281 if (ranges_ == NULL) {
4115 ranges_ = new ZoneList<CharacterRange>(2); 4282 ranges_ = new ZoneList<CharacterRange>(2);
4116 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 4283 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4117 } 4284 }
4118 return ranges_; 4285 return ranges_;
4119 } 4286 }
4120 4287
4121 4288
4289 // Move a number of elements in a zonelist to another position
4290 // in the same list. Handles overlapping source and target areas.
4291 static void MoveRanges(ZoneList<CharacterRange>* list,
4292 int from,
4293 int to,
4294 int count) {
4295 // Ranges are potentially overlapping.
4296 if (from < to) {
4297 for (int i = count - 1; i >= 0; i--) {
4298 list->at(to + i) = list->at(from + i);
4299 }
4300 } else {
4301 for (int i = 0; i < count; i++) {
4302 list->at(to + i) = list->at(from + i);
4303 }
4304 }
4305 }
4306
4307
4308 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
4309 int count,
4310 CharacterRange insert) {
4311 // Inserts a range into list[0..count[, which must be sorted
4312 // by from value and non-overlapping and non-adjacent, using at most
4313 // list[0..count] for the result. Returns the number of resulting
4314 // canonicalized ranges. Inserting a range may collapse existing ranges into
4315 // fewer ranges, so the return value can be anything in the range 1..count+1.
4316 uc16 from = insert.from();
4317 uc16 to = insert.to();
4318 int start_pos = 0;
4319 int end_pos = count;
4320 for (int i = count - 1; i >= 0; i--) {
4321 CharacterRange current = list->at(i);
4322 if (current.from() > to + 1) {
4323 end_pos = i;
4324 } else if (current.to() + 1 < from) {
4325 start_pos = i + 1;
4326 break;
4327 }
4328 }
4329
4330 // Inserted range overlaps, or is adjacent to, ranges at positions
4331 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4332 // not affected by the insertion.
4333 // If start_pos == end_pos, the range must be inserted before start_pos.
4334 // if start_pos < end_pos, the entire range from start_pos to end_pos
4335 // must be merged with the insert range.
4336
4337 if (start_pos == end_pos) {
4338 // Insert between existing ranges at position start_pos.
4339 if (start_pos < count) {
4340 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4341 }
4342 list->at(start_pos) = insert;
4343 return count + 1;
4344 }
4345 if (start_pos + 1 == end_pos) {
4346 // Replace single existing range at position start_pos.
4347 CharacterRange to_replace = list->at(start_pos);
4348 int new_from = Min(to_replace.from(), from);
4349 int new_to = Max(to_replace.to(), to);
4350 list->at(start_pos) = CharacterRange(new_from, new_to);
4351 return count;
4352 }
4353 // Replace a number of existing ranges from start_pos to end_pos - 1.
4354 // Move the remaining ranges down.
4355
4356 int new_from = Min(list->at(start_pos).from(), from);
4357 int new_to = Max(list->at(end_pos - 1).to(), to);
4358 if (end_pos < count) {
4359 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4360 }
4361 list->at(start_pos) = CharacterRange(new_from, new_to);
4362 return count - (end_pos - start_pos) + 1;
4363 }
4364
4365
4366 void CharacterSet::Canonicalize() {
4367 // Special/default classes are always considered canonical. The result
4368 // of calling ranges() will be sorted.
4369 if (ranges_ == NULL) return;
4370 CharacterRange::Canonicalize(ranges_);
4371 }
4372
4373
4374 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
4375 if (character_ranges->length() <= 1) return;
4376 // Check whether ranges are already canonical (increasing, non-overlapping,
4377 // non-adjacent).
4378 int n = character_ranges->length();
4379 int max = character_ranges->at(0).to();
4380 int i = 1;
4381 while (i < n) {
4382 CharacterRange current = character_ranges->at(i);
4383 if (current.from() <= max + 1) {
4384 break;
4385 }
4386 max = current.to();
4387 i++;
4388 }
4389 // Canonical until the i'th range. If that's all of them, we are done.
4390 if (i == n) return;
4391
4392 // The ranges at index i and forward are not canonicalized. Make them so by
4393 // doing the equivalent of insertion sort (inserting each into the previous
4394 // list, in order).
4395 // Notice that inserting a range can reduce the number of ranges in the
4396 // result due to combining of adjacent and overlapping ranges.
4397 int read = i; // Range to insert.
4398 int num_canonical = i; // Length of canonicalized part of list.
4399 do {
4400 num_canonical = InsertRangeInCanonicalList(character_ranges,
4401 num_canonical,
4402 character_ranges->at(read));
4403 read++;
4404 } while (read < n);
4405 character_ranges->Rewind(num_canonical);
4406
4407 ASSERT(CharacterRange::IsCanonical(character_ranges));
4408 }
4409
4410
4411 // Utility function for CharacterRange::Merge. Adds a range at the end of
4412 // a canonicalized range list, if necessary merging the range with the last
4413 // range of the list.
4414 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
4415 if (set == NULL) return;
4416 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
4417 int n = set->length();
4418 if (n > 0) {
4419 CharacterRange lastRange = set->at(n - 1);
4420 if (lastRange.to() == range.from() - 1) {
4421 set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
4422 return;
4423 }
4424 }
4425 set->Add(range);
4426 }
4427
4428
4429 static void AddRangeToSelectedSet(int selector,
4430 ZoneList<CharacterRange>* first_set,
4431 ZoneList<CharacterRange>* second_set,
4432 ZoneList<CharacterRange>* intersection_set,
4433 CharacterRange range) {
4434 switch (selector) {
4435 case kInsideFirst:
4436 AddRangeToSet(first_set, range);
4437 break;
4438 case kInsideSecond:
4439 AddRangeToSet(second_set, range);
4440 break;
4441 case kInsideBoth:
4442 AddRangeToSet(intersection_set, range);
4443 break;
4444 }
4445 }
4446
4447
4448
4449 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
4450 ZoneList<CharacterRange>* second_set,
4451 ZoneList<CharacterRange>* first_set_only_out,
4452 ZoneList<CharacterRange>* second_set_only_out,
4453 ZoneList<CharacterRange>* both_sets_out) {
4454 // Inputs are canonicalized.
4455 ASSERT(CharacterRange::IsCanonical(first_set));
4456 ASSERT(CharacterRange::IsCanonical(second_set));
4457 // Outputs are empty, if applicable.
4458 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
4459 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
4460 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
4461
4462 // Merge sets by iterating through the lists in order of lowest "from" value,
4463 // and putting intervals into one of three sets.
4464
4465 if (first_set->length() == 0) {
4466 second_set_only_out->AddAll(*second_set);
4467 return;
4468 }
4469 if (second_set->length() == 0) {
4470 first_set_only_out->AddAll(*first_set);
4471 return;
4472 }
4473 // Indices into input lists.
4474 int i1 = 0;
4475 int i2 = 0;
4476 // Cache length of input lists.
4477 int n1 = first_set->length();
4478 int n2 = second_set->length();
4479 // Current range. May be invalid if state is kInsideNone.
4480 int from = 0;
4481 int to = -1;
4482 // Where current range comes from.
4483 int state = kInsideNone;
4484
4485 while (i1 < n1 || i2 < n2) {
4486 CharacterRange next_range;
4487 int range_source;
4488 if (i2 == n2 || first_set->at(i1).from() < second_set->at(i2).from()) {
4489 next_range = first_set->at(i1++);
4490 range_source = kInsideFirst;
4491 } else {
4492 next_range = second_set->at(i2++);
4493 range_source = kInsideSecond;
4494 }
4495 if (to < next_range.from()) {
4496 // Ranges disjoint: |current| |next|
4497 AddRangeToSelectedSet(state,
4498 first_set_only_out,
4499 second_set_only_out,
4500 both_sets_out,
4501 CharacterRange(from, to));
4502 from = next_range.from();
4503 to = next_range.to();
4504 state = range_source;
4505 } else {
4506 if (from < next_range.from()) {
4507 AddRangeToSelectedSet(state,
4508 first_set_only_out,
4509 second_set_only_out,
4510 both_sets_out,
4511 CharacterRange(from, next_range.from()-1));
4512 }
4513 if (to < next_range.to()) {
4514 // Ranges overlap: |current|
4515 // |next|
4516 AddRangeToSelectedSet(state | range_source,
4517 first_set_only_out,
4518 second_set_only_out,
4519 both_sets_out,
4520 CharacterRange(next_range.from(), to));
4521 from = to + 1;
4522 to = next_range.to();
4523 state = range_source;
4524 } else {
4525 // Range included: |current| , possibly ending at same character.
4526 // |next|
4527 AddRangeToSelectedSet(
4528 state | range_source,
4529 first_set_only_out,
4530 second_set_only_out,
4531 both_sets_out,
4532 CharacterRange(next_range.from(), next_range.to()));
4533 from = next_range.to() + 1;
4534 // If ranges end at same character, both ranges are consumed completely.
4535 if (next_range.to() == to) state = kInsideNone;
4536 }
4537 }
4538 }
4539 AddRangeToSelectedSet(state,
4540 first_set_only_out,
4541 second_set_only_out,
4542 both_sets_out,
4543 CharacterRange(from, to));
4544 }
4545
4546
4547 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
4548 ZoneList<CharacterRange>* negated_ranges) {
4549 ASSERT(CharacterRange::IsCanonical(ranges));
4550 ASSERT_EQ(0, negated_ranges->length());
4551 int range_count = ranges->length();
4552 uc16 from = 0;
4553 int i = 0;
4554 if (range_count > 0 && ranges->at(0).from() == 0) {
4555 from = ranges->at(0).to();
4556 i = 1;
4557 }
4558 while (i < range_count) {
4559 CharacterRange range = ranges->at(i);
4560 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4561 from = range.to();
4562 i++;
4563 }
4564 if (from < String::kMaxUC16CharCode) {
4565 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
4566 }
4567 }
4568
4569
4122 4570
4123 // ------------------------------------------------------------------- 4571 // -------------------------------------------------------------------
4124 // Interest propagation 4572 // Interest propagation
4125 4573
4126 4574
4127 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) { 4575 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4128 for (int i = 0; i < siblings_.length(); i++) { 4576 for (int i = 0; i < siblings_.length(); i++) {
4129 RegExpNode* sibling = siblings_.Get(i); 4577 RegExpNode* sibling = siblings_.Get(i);
4130 if (sibling->info()->Matches(info)) 4578 if (sibling->info()->Matches(info))
4131 return sibling; 4579 return sibling;
(...skipping 271 matching lines...) Expand 10 before | Expand all | Expand 10 after
4403 } 4851 }
4404 4852
4405 4853
4406 void Analysis::VisitBackReference(BackReferenceNode* that) { 4854 void Analysis::VisitBackReference(BackReferenceNode* that) {
4407 EnsureAnalyzed(that->on_success()); 4855 EnsureAnalyzed(that->on_success());
4408 } 4856 }
4409 4857
4410 4858
4411 void Analysis::VisitAssertion(AssertionNode* that) { 4859 void Analysis::VisitAssertion(AssertionNode* that) {
4412 EnsureAnalyzed(that->on_success()); 4860 EnsureAnalyzed(that->on_success());
4413 } 4861 AssertionNode::AssertionNodeType type = that->type();
4862 if (type == AssertionNode::AT_BOUNDARY ||
4863 type == AssertionNode::AT_NON_BOUNDARY) {
4864 // Check if the following character is known to be a word character
4865 // or known to not be a word character.
4866 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
4867
4868 CharacterRange::Canonicalize(following_chars);
4869
4870 SetRelation word_relation =
4871 CharacterRange::WordCharacterRelation(following_chars);
4872 if (word_relation.ContainedIn()) {
4873 // Following character is definitely a word character.
4874 type = (type == AssertionNode::AT_BOUNDARY) ?
4875 AssertionNode::AFTER_NONWORD_CHARACTER :
4876 AssertionNode::AFTER_WORD_CHARACTER;
4877 that->set_type(type);
4878 } else if (word_relation.Disjoint()) {
4879 // Following character is definitely *not* a word character.
4880 type = (type == AssertionNode::AT_BOUNDARY) ?
4881 AssertionNode::AFTER_WORD_CHARACTER :
4882 AssertionNode::AFTER_NONWORD_CHARACTER;
4883 that->set_type(type);
4884 }
4885 }
4886 }
4887
4888
4889 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
4890 if (first_character_set_ == NULL) {
4891 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
4892 // If we can't find an exact solution within the budget, we
4893 // set the value to the set of every character, i.e., all characters
4894 // are possible.
4895 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
4896 all_set->Add(CharacterRange::Everything());
4897 first_character_set_ = all_set;
4898 }
4899 }
4900 return first_character_set_;
4901 }
4902
4903
4904 int RegExpNode::ComputeFirstCharacterSet(int budget) {
4905 // Default behavior is to not be able to determine the first character.
4906 return kComputeFirstCharacterSetFail;
4907 }
4908
4909
4910 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
4911 budget--;
4912 if (budget >= 0) {
4913 // Find loop min-iteration. It's the value of the guarded choice node
4914 // with a GEQ guard, if any.
4915 int min_repetition = 0;
4916
4917 for (int i = 0; i <= 1; i++) {
4918 GuardedAlternative alternative = alternatives()->at(i);
4919 ZoneList<Guard*>* guards = alternative.guards();
4920 if (guards != NULL && guards->length() > 0) {
4921 Guard* guard = guards->at(0);
4922 if (guard->op() == Guard::GEQ) {
4923 min_repetition = guard->value();
4924 break;
4925 }
4926 }
4927 }
4928
4929 budget = loop_node()->ComputeFirstCharacterSet(budget);
4930 if (budget >= 0) {
4931 ZoneList<CharacterRange>* character_set =
4932 loop_node()->first_character_set();
4933 if (body_can_be_zero_length() || min_repetition == 0) {
4934 budget = continue_node()->ComputeFirstCharacterSet(budget);
4935 if (budget < 0) return budget;
4936 ZoneList<CharacterRange>* body_set =
4937 continue_node()->first_character_set();
4938 ZoneList<CharacterRange>* union_set =
4939 new ZoneList<CharacterRange>(Max(character_set->length(),
4940 body_set->length()));
4941 CharacterRange::Merge(character_set,
4942 body_set,
4943 union_set,
4944 union_set,
4945 union_set);
4946 character_set = union_set;
4947 }
4948 set_first_character_set(character_set);
4949 }
4950 }
4951 return budget;
4952 }
4953
4954
4955 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
4956 budget--;
4957 if (budget >= 0) {
4958 GuardedAlternative successor = this->alternatives()->at(1);
4959 RegExpNode* successor_node = successor.node();
4960 budget = successor_node->ComputeFirstCharacterSet(budget);
4961 if (budget >= 0) {
4962 set_first_character_set(successor_node->first_character_set());
4963 }
4964 }
4965 return budget;
4966 }
4967
4968
4969 // The first character set of an EndNode is unknowable. Just use the
4970 // default implementation that fails and returns all characters as possible.
4971
4972
4973 int AssertionNode::ComputeFirstCharacterSet(int budget) {
4974 budget -= 1;
4975 if (budget >= 0) {
4976 switch (type_) {
4977 case AT_END: {
4978 set_first_character_set(new ZoneList<CharacterRange>(0));
4979 break;
4980 }
4981 case AT_START:
4982 case AT_BOUNDARY:
4983 case AT_NON_BOUNDARY:
4984 case AFTER_NEWLINE:
4985 case AFTER_NONWORD_CHARACTER:
4986 case AFTER_WORD_CHARACTER: {
4987 ASSERT_NOT_NULL(on_success());
4988 budget = on_success()->ComputeFirstCharacterSet(budget);
4989 set_first_character_set(on_success()->first_character_set());
4990 break;
4991 }
4992 }
4993 }
4994 return budget;
4995 }
4996
4997
4998 int ActionNode::ComputeFirstCharacterSet(int budget) {
4999 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
5000 budget--;
5001 if (budget >= 0) {
5002 ASSERT_NOT_NULL(on_success());
5003 budget = on_success()->ComputeFirstCharacterSet(budget);
5004 if (budget >= 0) {
5005 set_first_character_set(on_success()->first_character_set());
5006 }
5007 }
5008 return budget;
5009 }
5010
5011
5012 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
5013 // We don't know anything about the first character of a backreference
5014 // at this point.
5015 return kComputeFirstCharacterSetFail;
5016 }
5017
5018
5019 int TextNode::ComputeFirstCharacterSet(int budget) {
5020 budget--;
5021 if (budget >= 0) {
5022 ASSERT_NE(0, elements()->length());
5023 TextElement text = elements()->at(0);
5024 if (text.type == TextElement::ATOM) {
5025 RegExpAtom* atom = text.data.u_atom;
5026 ASSERT_NE(0, atom->length());
5027 uc16 first_char = atom->data()[0];
5028 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
5029 range->Add(CharacterRange(first_char, first_char));
5030 set_first_character_set(range);
5031 } else {
5032 ASSERT(text.type == TextElement::CHAR_CLASS);
5033 RegExpCharacterClass* char_class = text.data.u_char_class;
5034 if (char_class->is_negated()) {
5035 ZoneList<CharacterRange>* ranges = char_class->ranges();
5036 int length = ranges->length();
5037 int new_length = length + 1;
5038 if (length > 0) {
5039 if (ranges->at(0).from() == 0) new_length--;
5040 if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
5041 new_length--;
5042 }
5043 }
5044 ZoneList<CharacterRange>* negated_ranges =
5045 new ZoneList<CharacterRange>(new_length);
5046 CharacterRange::Negate(ranges, negated_ranges);
5047 set_first_character_set(negated_ranges);
5048 } else {
5049 set_first_character_set(char_class->ranges());
5050 }
5051 }
5052 }
5053 return budget;
5054 }
5055
4414 5056
4415 5057
4416 // ------------------------------------------------------------------- 5058 // -------------------------------------------------------------------
4417 // Dispatch table construction 5059 // Dispatch table construction
4418 5060
4419 5061
4420 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5062 void DispatchTableConstructor::VisitEnd(EndNode* that) {
4421 AddRange(CharacterRange::Everything()); 5063 AddRange(CharacterRange::Everything());
4422 } 5064 }
4423 5065
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
4464 AddRange(CharacterRange::Everything()); 5106 AddRange(CharacterRange::Everything());
4465 } 5107 }
4466 5108
4467 5109
4468 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5110 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4469 RegExpNode* target = that->on_success(); 5111 RegExpNode* target = that->on_success();
4470 target->Accept(this); 5112 target->Accept(this);
4471 } 5113 }
4472 5114
4473 5115
4474
4475 static int CompareRangeByFrom(const CharacterRange* a, 5116 static int CompareRangeByFrom(const CharacterRange* a,
4476 const CharacterRange* b) { 5117 const CharacterRange* b) {
4477 return Compare<uc16>(a->from(), b->from()); 5118 return Compare<uc16>(a->from(), b->from());
4478 } 5119 }
4479 5120
4480 5121
4481 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5122 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4482 ranges->Sort(CompareRangeByFrom); 5123 ranges->Sort(CompareRangeByFrom);
4483 uc16 last = 0; 5124 uc16 last = 0;
4484 for (int i = 0; i < ranges->length(); i++) { 5125 for (int i = 0; i < ranges->length(); i++) {
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
4600 RegExpMacroAssemblerIrregexp macro_assembler(codes); 5241 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4601 #endif 5242 #endif
4602 5243
4603 return compiler.Assemble(&macro_assembler, 5244 return compiler.Assemble(&macro_assembler,
4604 node, 5245 node,
4605 data->capture_count, 5246 data->capture_count,
4606 pattern); 5247 pattern);
4607 } 5248 }
4608 5249
4609 }} // namespace v8::internal 5250 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/parser.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698