Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 2725583002: [regexp] fix /\W/ui wrt \u017f and \u212a. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | src/regexp/regexp-ast.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include <memory> 7 #include <memory>
8 8
9 #include "src/base/platform/platform.h" 9 #include "src/base/platform/platform.h"
10 #include "src/compilation-cache.h" 10 #include "src/compilation-cache.h"
(...skipping 5090 matching lines...) Expand 10 before | Expand all | Expand 10 after
5101 Zone* zone = compiler->zone(); 5101 Zone* zone = compiler->zone();
5102 // Advance any character. If the character happens to be a lead surrogate and 5102 // Advance any character. If the character happens to be a lead surrogate and
5103 // we advanced into the middle of a surrogate pair, it will work out, as 5103 // we advanced into the middle of a surrogate pair, it will work out, as
5104 // nothing will match from there. We will have to advance again, consuming 5104 // nothing will match from there. We will have to advance again, consuming
5105 // the associated trail surrogate. 5105 // the associated trail surrogate.
5106 ZoneList<CharacterRange>* range = CharacterRange::List( 5106 ZoneList<CharacterRange>* range = CharacterRange::List(
5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); 5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5109 } 5109 }
5110 5110
5111 5111 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5112 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5113 ZoneList<CharacterRange>* ranges) {
5114 #ifdef V8_I18N_SUPPORT 5112 #ifdef V8_I18N_SUPPORT
5115 // Use ICU to compute the case fold closure over the ranges. 5113 // Use ICU to compute the case fold closure over the ranges.
5116 DCHECK(compiler->unicode());
5117 DCHECK(compiler->ignore_case());
5118 icu::UnicodeSet set; 5114 icu::UnicodeSet set;
5119 for (int i = 0; i < ranges->length(); i++) { 5115 for (int i = 0; i < ranges->length(); i++) {
5120 set.add(ranges->at(i).from(), ranges->at(i).to()); 5116 set.add(ranges->at(i).from(), ranges->at(i).to());
5121 } 5117 }
5122 ranges->Clear(); 5118 ranges->Clear();
5123 set.closeOver(USET_CASE_INSENSITIVE); 5119 set.closeOver(USET_CASE_INSENSITIVE);
5124 // Full case mapping maps single characters to multiple characters. 5120 // Full case mapping maps single characters to multiple characters.
5125 // Those are represented as strings in the set. Remove them so that 5121 // Those are represented as strings in the set. Remove them so that
5126 // we end up with only simple and common case mappings. 5122 // we end up with only simple and common case mappings.
5127 set.removeAllStrings(); 5123 set.removeAllStrings();
5128 Zone* zone = compiler->zone();
5129 for (int i = 0; i < set.getRangeCount(); i++) { 5124 for (int i = 0; i < set.getRangeCount(); i++) {
5130 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 5125 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5131 zone); 5126 zone);
5132 } 5127 }
5133 // No errors and everything we collected has been ranges. 5128 // No errors and everything we collected has been ranges.
5134 #else 5129 CharacterRange::Canonicalize(ranges);
5135 // Fallback if ICU is not included.
5136 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5137 ranges, compiler->one_byte());
5138 #endif // V8_I18N_SUPPORT 5130 #endif // V8_I18N_SUPPORT
5139 CharacterRange::Canonicalize(ranges);
5140 } 5131 }
5141 5132
5142 5133
5143 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5134 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5144 RegExpNode* on_success) { 5135 RegExpNode* on_success) {
5145 set_.Canonicalize(); 5136 set_.Canonicalize();
5146 Zone* zone = compiler->zone(); 5137 Zone* zone = compiler->zone();
5147 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5138 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5148 if (compiler->unicode() && compiler->ignore_case()) { 5139 if (compiler->unicode() && compiler->ignore_case()) {
5149 AddUnicodeCaseEquivalents(compiler, ranges); 5140 AddUnicodeCaseEquivalents(ranges, zone);
5150 } 5141 }
5151 if (compiler->unicode() && !compiler->one_byte()) { 5142 if (compiler->unicode() && !compiler->one_byte()) {
5152 if (is_negated()) { 5143 if (is_negated()) {
5153 ZoneList<CharacterRange>* negated = 5144 ZoneList<CharacterRange>* negated =
5154 new (zone) ZoneList<CharacterRange>(2, zone); 5145 new (zone) ZoneList<CharacterRange>(2, zone);
5155 CharacterRange::Negate(ranges, negated, zone); 5146 CharacterRange::Negate(ranges, negated, zone);
5156 ranges = negated; 5147 ranges = negated;
5157 } 5148 }
5158 if (ranges->length() == 0) { 5149 if (ranges->length() == 0) {
5159 ranges->Add(CharacterRange::Everything(), zone); 5150 ranges->Add(CharacterRange::Everything(), zone);
(...skipping 452 matching lines...) Expand 10 before | Expand all | Expand 10 after
5612 center->AddContinueAlternative(rest_alt); 5603 center->AddContinueAlternative(rest_alt);
5613 center->AddLoopAlternative(body_alt); 5604 center->AddLoopAlternative(body_alt);
5614 } 5605 }
5615 if (needs_counter) { 5606 if (needs_counter) {
5616 return ActionNode::SetRegister(reg_ctr, 0, center); 5607 return ActionNode::SetRegister(reg_ctr, 0, center);
5617 } else { 5608 } else {
5618 return center; 5609 return center;
5619 } 5610 }
5620 } 5611 }
5621 5612
5613 namespace {
5614 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5615 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
jgruber 2017/02/28 13:44:56 Nit: Please swap the group order of \B to make the
Yang 2017/02/28 14:26:11 Done.
5616 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5617 RegExpNode* on_success,
5618 RegExpAssertion::AssertionType type) {
5619 DCHECK(compiler->unicode() && compiler->ignore_case());
5620 Zone* zone = compiler->zone();
5621 ZoneList<CharacterRange>* word_range =
5622 new (zone) ZoneList<CharacterRange>(2, zone);
5623 CharacterRange::AddClassEscape('w', word_range, true, zone);
5624 int stack_register = compiler->UnicodeLookaroundStackRegister();
5625 int position_register = compiler->UnicodeLookaroundPositionRegister();
5626 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5627 // Add two choices. The (non-)boundary could start with a word or
5628 // a non-word-character.
5629 for (int i = 0; i < 2; i++) {
5630 bool lookbehind_for_word = i == 0;
5631 bool lookahead_for_word =
5632 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5633 // Look to the left.
5634 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5635 stack_register, position_register);
5636 RegExpNode* backward = TextNode::CreateForCharacterRanges(
5637 zone, word_range, true, lookbehind.on_match_success());
5638 // Look to the right.
5639 RegExpLookaround::Builder lookahead(lookahead_for_word,
5640 lookbehind.ForMatch(backward),
5641 stack_register, position_register);
5642 RegExpNode* forward = TextNode::CreateForCharacterRanges(
5643 zone, word_range, false, lookahead.on_match_success());
5644 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5645 }
5646 return result;
5647 }
5648 } // anonymous namespace
5622 5649
5623 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5650 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5624 RegExpNode* on_success) { 5651 RegExpNode* on_success) {
5625 NodeInfo info; 5652 NodeInfo info;
5626 Zone* zone = compiler->zone(); 5653 Zone* zone = compiler->zone();
5627 5654
5628 switch (assertion_type()) { 5655 switch (assertion_type()) {
5629 case START_OF_LINE: 5656 case START_OF_LINE:
5630 return AssertionNode::AfterNewline(on_success); 5657 return AssertionNode::AfterNewline(on_success);
5631 case START_OF_INPUT: 5658 case START_OF_INPUT:
5632 return AssertionNode::AtStart(on_success); 5659 return AssertionNode::AtStart(on_success);
5633 case BOUNDARY: 5660 case BOUNDARY:
5634 return AssertionNode::AtBoundary(on_success); 5661 return compiler->unicode() && compiler->ignore_case()
jgruber 2017/02/28 13:44:56 WDYT about adding compiler->needs_unicode_case_equ
Yang 2017/02/28 14:26:11 Done.
5662 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
5663 : AssertionNode::AtBoundary(on_success);
5635 case NON_BOUNDARY: 5664 case NON_BOUNDARY:
5636 return AssertionNode::AtNonBoundary(on_success); 5665 return compiler->unicode() && compiler->ignore_case()
5666 ? BoundaryAssertionAsLookaround(compiler, on_success,
5667 NON_BOUNDARY)
5668 : AssertionNode::AtNonBoundary(on_success);
5637 case END_OF_INPUT: 5669 case END_OF_INPUT:
5638 return AssertionNode::AtEnd(on_success); 5670 return AssertionNode::AtEnd(on_success);
5639 case END_OF_LINE: { 5671 case END_OF_LINE: {
5640 // Compile $ in multiline regexps as an alternation with a positive 5672 // Compile $ in multiline regexps as an alternation with a positive
5641 // lookahead in one side and an end-of-input on the other side. 5673 // lookahead in one side and an end-of-input on the other side.
5642 // We need two registers for the lookahead. 5674 // We need two registers for the lookahead.
5643 int stack_pointer_register = compiler->AllocateRegister(); 5675 int stack_pointer_register = compiler->AllocateRegister();
5644 int position_register = compiler->AllocateRegister(); 5676 int position_register = compiler->AllocateRegister();
5645 // The ChoiceNode to distinguish between a newline and end-of-input. 5677 // The ChoiceNode to distinguish between a newline and end-of-input.
5646 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5678 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5647 // Create a newline atom. 5679 // Create a newline atom.
5648 ZoneList<CharacterRange>* newline_ranges = 5680 ZoneList<CharacterRange>* newline_ranges =
5649 new(zone) ZoneList<CharacterRange>(3, zone); 5681 new(zone) ZoneList<CharacterRange>(3, zone);
5650 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5682 CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5651 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); 5683 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5652 TextNode* newline_matcher = new (zone) TextNode( 5684 TextNode* newline_matcher = new (zone) TextNode(
5653 newline_atom, false, ActionNode::PositiveSubmatchSuccess( 5685 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5654 stack_pointer_register, position_register, 5686 stack_pointer_register, position_register,
5655 0, // No captures inside. 5687 0, // No captures inside.
5656 -1, // Ignored if no captures. 5688 -1, // Ignored if no captures.
5657 on_success)); 5689 on_success));
5658 // Create an end-of-input matcher. 5690 // Create an end-of-input matcher.
5659 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5691 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5660 stack_pointer_register, 5692 stack_pointer_register,
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after
5814 uc16 last = 0x0000; 5846 uc16 last = 0x0000;
5815 for (int i = 0; i < elmc; i += 2) { 5847 for (int i = 0; i < elmc; i += 2) {
5816 DCHECK(last <= elmv[i] - 1); 5848 DCHECK(last <= elmv[i] - 1);
5817 DCHECK(elmv[i] < elmv[i + 1]); 5849 DCHECK(elmv[i] < elmv[i + 1]);
5818 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 5850 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5819 last = elmv[i + 1]; 5851 last = elmv[i + 1];
5820 } 5852 }
5821 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); 5853 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5822 } 5854 }
5823 5855
5856 void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
5857 bool add_unicode_case_equivalents,
5858 Zone* zone) {
5859 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5860 // See #sec-runtime-semantics-wordcharacters-abstract-operation
5861 // In case of unicode and ignore_case, we need to create the closure over
5862 // case equivalent characters before negating.
5863 ZoneList<CharacterRange>* new_ranges =
5864 new (zone) ZoneList<CharacterRange>(2, zone);
5865 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5866 AddUnicodeCaseEquivalents(new_ranges, zone);
5867 if (type == 'W') {
5868 ZoneList<CharacterRange>* negated =
5869 new (zone) ZoneList<CharacterRange>(2, zone);
5870 CharacterRange::Negate(new_ranges, negated, zone);
5871 new_ranges = negated;
5872 }
5873 ranges->AddAll(*new_ranges, zone);
5874 return;
5875 }
5824 5876
5825 void CharacterRange::AddClassEscape(uc16 type,
5826 ZoneList<CharacterRange>* ranges,
5827 Zone* zone) {
5828 switch (type) { 5877 switch (type) {
5829 case 's': 5878 case 's':
5830 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5879 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5831 break; 5880 break;
5832 case 'S': 5881 case 'S':
5833 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5882 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5834 break; 5883 break;
5835 case 'w': 5884 case 'w':
5836 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5885 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5837 break; 5886 break;
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
5958 if (next_range.from() <= max + 1) return false; 6007 if (next_range.from() <= max + 1) return false;
5959 max = next_range.to(); 6008 max = next_range.to();
5960 } 6009 }
5961 return true; 6010 return true;
5962 } 6011 }
5963 6012
5964 6013
5965 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 6014 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5966 if (ranges_ == NULL) { 6015 if (ranges_ == NULL) {
5967 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 6016 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5968 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 6017 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5969 } 6018 }
5970 return ranges_; 6019 return ranges_;
5971 } 6020 }
5972 6021
5973 6022
5974 // Move a number of elements in a zonelist to another position 6023 // Move a number of elements in a zonelist to another position
5975 // in the same list. Handles overlapping source and target areas. 6024 // in the same list. Handles overlapping source and target areas.
5976 static void MoveRanges(ZoneList<CharacterRange>* list, 6025 static void MoveRanges(ZoneList<CharacterRange>* list,
5977 int from, 6026 int from,
5978 int to, 6027 int to,
(...skipping 885 matching lines...) Expand 10 before | Expand all | Expand 10 after
6864 6913
6865 6914
6866 void RegExpResultsCache::Clear(FixedArray* cache) { 6915 void RegExpResultsCache::Clear(FixedArray* cache) {
6867 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6916 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6868 cache->set(i, Smi::kZero); 6917 cache->set(i, Smi::kZero);
6869 } 6918 }
6870 } 6919 }
6871 6920
6872 } // namespace internal 6921 } // namespace internal
6873 } // namespace v8 6922 } // namespace v8
OLDNEW
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | src/regexp/regexp-ast.h » ('J')

Powered by Google App Engine
This is Rietveld 408576698