OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/jsregexp.h" | 5 #include "src/regexp/jsregexp.h" |
6 | 6 |
7 #include <memory> | 7 #include <memory> |
8 | 8 |
9 #include "src/base/platform/platform.h" | 9 #include "src/base/platform/platform.h" |
10 #include "src/compilation-cache.h" | 10 #include "src/compilation-cache.h" |
(...skipping 5090 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5101 Zone* zone = compiler->zone(); | 5101 Zone* zone = compiler->zone(); |
5102 // Advance any character. If the character happens to be a lead surrogate and | 5102 // Advance any character. If the character happens to be a lead surrogate and |
5103 // we advanced into the middle of a surrogate pair, it will work out, as | 5103 // we advanced into the middle of a surrogate pair, it will work out, as |
5104 // nothing will match from there. We will have to advance again, consuming | 5104 // nothing will match from there. We will have to advance again, consuming |
5105 // the associated trail surrogate. | 5105 // the associated trail surrogate. |
5106 ZoneList<CharacterRange>* range = CharacterRange::List( | 5106 ZoneList<CharacterRange>* range = CharacterRange::List( |
5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); | 5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); |
5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); | 5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); |
5109 } | 5109 } |
5110 | 5110 |
5111 | 5111 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { |
5112 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, | |
5113 ZoneList<CharacterRange>* ranges) { | |
5114 #ifdef V8_I18N_SUPPORT | 5112 #ifdef V8_I18N_SUPPORT |
5115 // Use ICU to compute the case fold closure over the ranges. | 5113 // Use ICU to compute the case fold closure over the ranges. |
5116 DCHECK(compiler->unicode()); | |
5117 DCHECK(compiler->ignore_case()); | |
5118 icu::UnicodeSet set; | 5114 icu::UnicodeSet set; |
5119 for (int i = 0; i < ranges->length(); i++) { | 5115 for (int i = 0; i < ranges->length(); i++) { |
5120 set.add(ranges->at(i).from(), ranges->at(i).to()); | 5116 set.add(ranges->at(i).from(), ranges->at(i).to()); |
5121 } | 5117 } |
5122 ranges->Clear(); | 5118 ranges->Clear(); |
5123 set.closeOver(USET_CASE_INSENSITIVE); | 5119 set.closeOver(USET_CASE_INSENSITIVE); |
5124 // Full case mappings map single characters to multiple characters. | 5120 // Full case mappings map single characters to multiple characters. |
5125 // Those are represented as strings in the set. Remove them so that | 5121 // Those are represented as strings in the set. Remove them so that |
5126 // we end up with only simple and common case mappings. | 5122 // we end up with only simple and common case mappings. |
5127 set.removeAllStrings(); | 5123 set.removeAllStrings(); |
5128 Zone* zone = compiler->zone(); | |
5129 for (int i = 0; i < set.getRangeCount(); i++) { | 5124 for (int i = 0; i < set.getRangeCount(); i++) { |
5130 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), | 5125 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), |
5131 zone); | 5126 zone); |
5132 } | 5127 } |
5133 // No errors, and everything we collected is a range. | 5128 // No errors, and everything we collected is a range. |
5134 #else | 5129 CharacterRange::Canonicalize(ranges); |
5135 // Fallback if ICU is not included. | |
5136 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), | |
5137 ranges, compiler->one_byte()); | |
5138 #endif // V8_I18N_SUPPORT | 5130 #endif // V8_I18N_SUPPORT |
5139 CharacterRange::Canonicalize(ranges); | |
5140 } | 5131 } |
5141 | 5132 |
5142 | 5133 |
5143 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, | 5134 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
5144 RegExpNode* on_success) { | 5135 RegExpNode* on_success) { |
5145 set_.Canonicalize(); | 5136 set_.Canonicalize(); |
5146 Zone* zone = compiler->zone(); | 5137 Zone* zone = compiler->zone(); |
5147 ZoneList<CharacterRange>* ranges = this->ranges(zone); | 5138 ZoneList<CharacterRange>* ranges = this->ranges(zone); |
5148 if (compiler->unicode() && compiler->ignore_case()) { | 5139 if (compiler->unicode() && compiler->ignore_case()) { |
5149 AddUnicodeCaseEquivalents(compiler, ranges); | 5140 AddUnicodeCaseEquivalents(ranges, zone); |
5150 } | 5141 } |
5151 if (compiler->unicode() && !compiler->one_byte()) { | 5142 if (compiler->unicode() && !compiler->one_byte()) { |
5152 if (is_negated()) { | 5143 if (is_negated()) { |
5153 ZoneList<CharacterRange>* negated = | 5144 ZoneList<CharacterRange>* negated = |
5154 new (zone) ZoneList<CharacterRange>(2, zone); | 5145 new (zone) ZoneList<CharacterRange>(2, zone); |
5155 CharacterRange::Negate(ranges, negated, zone); | 5146 CharacterRange::Negate(ranges, negated, zone); |
5156 ranges = negated; | 5147 ranges = negated; |
5157 } | 5148 } |
5158 if (ranges->length() == 0) { | 5149 if (ranges->length() == 0) { |
5159 ranges->Add(CharacterRange::Everything(), zone); | 5150 ranges->Add(CharacterRange::Everything(), zone); |
(...skipping 452 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5612 center->AddContinueAlternative(rest_alt); | 5603 center->AddContinueAlternative(rest_alt); |
5613 center->AddLoopAlternative(body_alt); | 5604 center->AddLoopAlternative(body_alt); |
5614 } | 5605 } |
5615 if (needs_counter) { | 5606 if (needs_counter) { |
5616 return ActionNode::SetRegister(reg_ctr, 0, center); | 5607 return ActionNode::SetRegister(reg_ctr, 0, center); |
5617 } else { | 5608 } else { |
5618 return center; | 5609 return center; |
5619 } | 5610 } |
5620 } | 5611 } |
5621 | 5612 |
5613 namespace { | |
5614 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and | |
5615 // \B to (?<=\W)(?=\W)|(?<=\w)(?=\w) | |
jgruber
2017/02/28 13:44:56
Nit: Please swap the group order of \B to make the
Yang
2017/02/28 14:26:11
Done.
| |
5616 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, | |
5617 RegExpNode* on_success, | |
5618 RegExpAssertion::AssertionType type) { | |
5619 DCHECK(compiler->unicode() && compiler->ignore_case()); | |
5620 Zone* zone = compiler->zone(); | |
5621 ZoneList<CharacterRange>* word_range = | |
5622 new (zone) ZoneList<CharacterRange>(2, zone); | |
5623 CharacterRange::AddClassEscape('w', word_range, true, zone); | |
5624 int stack_register = compiler->UnicodeLookaroundStackRegister(); | |
5625 int position_register = compiler->UnicodeLookaroundPositionRegister(); | |
5626 ChoiceNode* result = new (zone) ChoiceNode(2, zone); | |
5627 // Add two choices. The (non-)boundary could start with a word or | |
5628 // a non-word-character. | |
5629 for (int i = 0; i < 2; i++) { | |
5630 bool lookbehind_for_word = i == 0; | |
5631 bool lookahead_for_word = | |
5632 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; | |
5633 // Look to the left. | |
5634 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, | |
5635 stack_register, position_register); | |
5636 RegExpNode* backward = TextNode::CreateForCharacterRanges( | |
5637 zone, word_range, true, lookbehind.on_match_success()); | |
5638 // Look to the right. | |
5639 RegExpLookaround::Builder lookahead(lookahead_for_word, | |
5640 lookbehind.ForMatch(backward), | |
5641 stack_register, position_register); | |
5642 RegExpNode* forward = TextNode::CreateForCharacterRanges( | |
5643 zone, word_range, false, lookahead.on_match_success()); | |
5644 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); | |
5645 } | |
5646 return result; | |
5647 } | |
5648 } // anonymous namespace | |
5622 | 5649 |
5623 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, | 5650 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
5624 RegExpNode* on_success) { | 5651 RegExpNode* on_success) { |
5625 NodeInfo info; | 5652 NodeInfo info; |
5626 Zone* zone = compiler->zone(); | 5653 Zone* zone = compiler->zone(); |
5627 | 5654 |
5628 switch (assertion_type()) { | 5655 switch (assertion_type()) { |
5629 case START_OF_LINE: | 5656 case START_OF_LINE: |
5630 return AssertionNode::AfterNewline(on_success); | 5657 return AssertionNode::AfterNewline(on_success); |
5631 case START_OF_INPUT: | 5658 case START_OF_INPUT: |
5632 return AssertionNode::AtStart(on_success); | 5659 return AssertionNode::AtStart(on_success); |
5633 case BOUNDARY: | 5660 case BOUNDARY: |
5634 return AssertionNode::AtBoundary(on_success); | 5661 return compiler->unicode() && compiler->ignore_case() |
jgruber
2017/02/28 13:44:56
WDYT about adding compiler->needs_unicode_case_equ
Yang
2017/02/28 14:26:11
Done.
| |
5662 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY) | |
5663 : AssertionNode::AtBoundary(on_success); | |
5635 case NON_BOUNDARY: | 5664 case NON_BOUNDARY: |
5636 return AssertionNode::AtNonBoundary(on_success); | 5665 return compiler->unicode() && compiler->ignore_case() |
5666 ? BoundaryAssertionAsLookaround(compiler, on_success, | |
5667 NON_BOUNDARY) | |
5668 : AssertionNode::AtNonBoundary(on_success); | |
5637 case END_OF_INPUT: | 5669 case END_OF_INPUT: |
5638 return AssertionNode::AtEnd(on_success); | 5670 return AssertionNode::AtEnd(on_success); |
5639 case END_OF_LINE: { | 5671 case END_OF_LINE: { |
5640 // Compile $ in multiline regexps as an alternation with a positive | 5672 // Compile $ in multiline regexps as an alternation with a positive |
5641 // lookahead on one side and an end-of-input on the other side. | 5673 // lookahead on one side and an end-of-input on the other side. |
5642 // We need two registers for the lookahead. | 5674 // We need two registers for the lookahead. |
5643 int stack_pointer_register = compiler->AllocateRegister(); | 5675 int stack_pointer_register = compiler->AllocateRegister(); |
5644 int position_register = compiler->AllocateRegister(); | 5676 int position_register = compiler->AllocateRegister(); |
5645 // The ChoiceNode to distinguish between a newline and end-of-input. | 5677 // The ChoiceNode to distinguish between a newline and end-of-input. |
5646 ChoiceNode* result = new(zone) ChoiceNode(2, zone); | 5678 ChoiceNode* result = new(zone) ChoiceNode(2, zone); |
5647 // Create a newline atom. | 5679 // Create a newline atom. |
5648 ZoneList<CharacterRange>* newline_ranges = | 5680 ZoneList<CharacterRange>* newline_ranges = |
5649 new(zone) ZoneList<CharacterRange>(3, zone); | 5681 new(zone) ZoneList<CharacterRange>(3, zone); |
5650 CharacterRange::AddClassEscape('n', newline_ranges, zone); | 5682 CharacterRange::AddClassEscape('n', newline_ranges, false, zone); |
5651 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); | 5683 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); |
5652 TextNode* newline_matcher = new (zone) TextNode( | 5684 TextNode* newline_matcher = new (zone) TextNode( |
5653 newline_atom, false, ActionNode::PositiveSubmatchSuccess( | 5685 newline_atom, false, ActionNode::PositiveSubmatchSuccess( |
5654 stack_pointer_register, position_register, | 5686 stack_pointer_register, position_register, |
5655 0, // No captures inside. | 5687 0, // No captures inside. |
5656 -1, // Ignored if no captures. | 5688 -1, // Ignored if no captures. |
5657 on_success)); | 5689 on_success)); |
5658 // Create an end-of-input matcher. | 5690 // Create an end-of-input matcher. |
5659 RegExpNode* end_of_line = ActionNode::BeginSubmatch( | 5691 RegExpNode* end_of_line = ActionNode::BeginSubmatch( |
5660 stack_pointer_register, | 5692 stack_pointer_register, |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5814 uc16 last = 0x0000; | 5846 uc16 last = 0x0000; |
5815 for (int i = 0; i < elmc; i += 2) { | 5847 for (int i = 0; i < elmc; i += 2) { |
5816 DCHECK(last <= elmv[i] - 1); | 5848 DCHECK(last <= elmv[i] - 1); |
5817 DCHECK(elmv[i] < elmv[i + 1]); | 5849 DCHECK(elmv[i] < elmv[i + 1]); |
5818 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); | 5850 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); |
5819 last = elmv[i + 1]; | 5851 last = elmv[i + 1]; |
5820 } | 5852 } |
5821 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); | 5853 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); |
5822 } | 5854 } |
5823 | 5855 |
5856 void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, | |
5857 bool add_unicode_case_equivalents, | |
5858 Zone* zone) { | |
5859 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { | |
5860 // See #sec-runtime-semantics-wordcharacters-abstract-operation | |
5861 // In case of unicode and ignore_case, we need to create the closure over | |
5862 // case equivalent characters before negating. | |
5863 ZoneList<CharacterRange>* new_ranges = | |
5864 new (zone) ZoneList<CharacterRange>(2, zone); | |
5865 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); | |
5866 AddUnicodeCaseEquivalents(new_ranges, zone); | |
5867 if (type == 'W') { | |
5868 ZoneList<CharacterRange>* negated = | |
5869 new (zone) ZoneList<CharacterRange>(2, zone); | |
5870 CharacterRange::Negate(new_ranges, negated, zone); | |
5871 new_ranges = negated; | |
5872 } | |
5873 ranges->AddAll(*new_ranges, zone); | |
5874 return; | |
5875 } | |
5824 | 5876 |
5825 void CharacterRange::AddClassEscape(uc16 type, | |
5826 ZoneList<CharacterRange>* ranges, | |
5827 Zone* zone) { | |
5828 switch (type) { | 5877 switch (type) { |
5829 case 's': | 5878 case 's': |
5830 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); | 5879 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); |
5831 break; | 5880 break; |
5832 case 'S': | 5881 case 'S': |
5833 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); | 5882 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); |
5834 break; | 5883 break; |
5835 case 'w': | 5884 case 'w': |
5836 AddClass(kWordRanges, kWordRangeCount, ranges, zone); | 5885 AddClass(kWordRanges, kWordRangeCount, ranges, zone); |
5837 break; | 5886 break; |
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5958 if (next_range.from() <= max + 1) return false; | 6007 if (next_range.from() <= max + 1) return false; |
5959 max = next_range.to(); | 6008 max = next_range.to(); |
5960 } | 6009 } |
5961 return true; | 6010 return true; |
5962 } | 6011 } |
5963 | 6012 |
5964 | 6013 |
5965 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { | 6014 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { |
5966 if (ranges_ == NULL) { | 6015 if (ranges_ == NULL) { |
5967 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); | 6016 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); |
5968 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); | 6017 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); |
5969 } | 6018 } |
5970 return ranges_; | 6019 return ranges_; |
5971 } | 6020 } |
5972 | 6021 |
5973 | 6022 |
5974 // Move a number of elements in a zonelist to another position | 6023 // Move a number of elements in a zonelist to another position |
5975 // in the same list. Handles overlapping source and target areas. | 6024 // in the same list. Handles overlapping source and target areas. |
5976 static void MoveRanges(ZoneList<CharacterRange>* list, | 6025 static void MoveRanges(ZoneList<CharacterRange>* list, |
5977 int from, | 6026 int from, |
5978 int to, | 6027 int to, |
(...skipping 885 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
6864 | 6913 |
6865 | 6914 |
6866 void RegExpResultsCache::Clear(FixedArray* cache) { | 6915 void RegExpResultsCache::Clear(FixedArray* cache) { |
6867 for (int i = 0; i < kRegExpResultsCacheSize; i++) { | 6916 for (int i = 0; i < kRegExpResultsCacheSize; i++) { |
6868 cache->set(i, Smi::kZero); | 6917 cache->set(i, Smi::kZero); |
6869 } | 6918 } |
6870 } | 6919 } |
6871 | 6920 |
6872 } // namespace internal | 6921 } // namespace internal |
6873 } // namespace v8 | 6922 } // namespace v8 |
OLD | NEW |