src/regexp/jsregexp.cc - Issue 2725583002: [regexp] fix /\W/ui wrt \u017f and \u212a.

Side by Side Diff: src/regexp/jsregexp.cc

Issue 2725583002: [regexp] fix /\W/ui wrt \u017f and \u212a. (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include <memory>	7 #include <memory>

8	8

9 #include "src/base/platform/platform.h"	9 #include "src/base/platform/platform.h"

10 #include "src/compilation-cache.h"	10 #include "src/compilation-cache.h"

(...skipping 5090 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5101 Zone* zone = compiler->zone();	5101 Zone* zone = compiler->zone();

5102 // Advance any character. If the character happens to be a lead surrogate and	5102 // Advance any character. If the character happens to be a lead surrogate and

5103 // we advanced into the middle of a surrogate pair, it will work out, as	5103 // we advanced into the middle of a surrogate pair, it will work out, as

5104 // nothing will match from there. We will have to advance again, consuming	5104 // nothing will match from there. We will have to advance again, consuming

5105 // the associated trail surrogate.	5105 // the associated trail surrogate.

5106 ZoneList<CharacterRange>* range = CharacterRange::List(	5106 ZoneList<CharacterRange>* range = CharacterRange::List(

5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));	5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));

5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);	5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);

5109 }	5109 }

5110	5110

5111	5111 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {

5112 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,

5113 ZoneList<CharacterRange>* ranges) {

5114 #ifdef V8_I18N_SUPPORT	5112 #ifdef V8_I18N_SUPPORT

5115 // Use ICU to compute the case fold closure over the ranges.	5113 // Use ICU to compute the case fold closure over the ranges.

5116 DCHECK(compiler->unicode());

5117 DCHECK(compiler->ignore_case());

5118 icu::UnicodeSet set;	5114 icu::UnicodeSet set;

5119 for (int i = 0; i < ranges->length(); i++) {	5115 for (int i = 0; i < ranges->length(); i++) {

5120 set.add(ranges->at(i).from(), ranges->at(i).to());	5116 set.add(ranges->at(i).from(), ranges->at(i).to());

5121 }	5117 }

5122 ranges->Clear();	5118 ranges->Clear();

5123 set.closeOver(USET_CASE_INSENSITIVE);	5119 set.closeOver(USET_CASE_INSENSITIVE);

5124 // Full case mapping map single characters to multiple characters.	5120 // Full case mapping map single characters to multiple characters.

5125 // Those are represented as strings in the set. Remove them so that	5121 // Those are represented as strings in the set. Remove them so that

5126 // we end up with only simple and common case mappings.	5122 // we end up with only simple and common case mappings.

5127 set.removeAllStrings();	5123 set.removeAllStrings();

5128 Zone* zone = compiler->zone();

5129 for (int i = 0; i < set.getRangeCount(); i++) {	5124 for (int i = 0; i < set.getRangeCount(); i++) {

5130 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),	5125 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),

5131 zone);	5126 zone);

5132 }	5127 }

5133 // No errors and everything we collected have been ranges.	5128 // No errors and everything we collected have been ranges.

5134 #else	5129 CharacterRange::Canonicalize(ranges);

5135 // Fallback if ICU is not included.

5136 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),

5137 ranges, compiler->one_byte());

5138 #endif // V8_I18N_SUPPORT	5130 #endif // V8_I18N_SUPPORT

5139 CharacterRange::Canonicalize(ranges);

5140 }	5131 }

5141	5132

5142	5133

5143 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,	5134 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

5144 RegExpNode* on_success) {	5135 RegExpNode* on_success) {

5145 set_.Canonicalize();	5136 set_.Canonicalize();

5146 Zone* zone = compiler->zone();	5137 Zone* zone = compiler->zone();

5147 ZoneList<CharacterRange>* ranges = this->ranges(zone);	5138 ZoneList<CharacterRange>* ranges = this->ranges(zone);

5148 if (compiler->unicode() && compiler->ignore_case()) {	5139 if (compiler->unicode() && compiler->ignore_case()) {

5149 AddUnicodeCaseEquivalents(compiler, ranges);	5140 AddUnicodeCaseEquivalents(ranges, zone);

5150 }	5141 }

5151 if (compiler->unicode() && !compiler->one_byte()) {	5142 if (compiler->unicode() && !compiler->one_byte()) {

5152 if (is_negated()) {	5143 if (is_negated()) {

5153 ZoneList<CharacterRange>* negated =	5144 ZoneList<CharacterRange>* negated =

5154 new (zone) ZoneList<CharacterRange>(2, zone);	5145 new (zone) ZoneList<CharacterRange>(2, zone);

5155 CharacterRange::Negate(ranges, negated, zone);	5146 CharacterRange::Negate(ranges, negated, zone);

5156 ranges = negated;	5147 ranges = negated;

5157 }	5148 }

5158 if (ranges->length() == 0) {	5149 if (ranges->length() == 0) {

5159 ranges->Add(CharacterRange::Everything(), zone);	5150 ranges->Add(CharacterRange::Everything(), zone);

(...skipping 452 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5612 center->AddContinueAlternative(rest_alt);	5603 center->AddContinueAlternative(rest_alt);

5613 center->AddLoopAlternative(body_alt);	5604 center->AddLoopAlternative(body_alt);

5614 }	5605 }

5615 if (needs_counter) {	5606 if (needs_counter) {

5616 return ActionNode::SetRegister(reg_ctr, 0, center);	5607 return ActionNode::SetRegister(reg_ctr, 0, center);

5617 } else {	5608 } else {

5618 return center;	5609 return center;

5619 }	5610 }

5620 }	5611 }

5621	5612

	5613 namespace {

	5614 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and

	5615 // \B to (?<=\W)(?=\W)|(?<=\w)(?=\w)
	jgruber 2017/02/28 13:44:56: Nit: Please swap the group order of \B to make the loop below easier to understand (i == 0 in the first column, i == 1 in the second).
	Yang 2017/02/28 14:26:11: Done.
	5616 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,

	5617 RegExpNode* on_success,

	5618 RegExpAssertion::AssertionType type) {

	5619 DCHECK(compiler->unicode() && compiler->ignore_case());

	5620 Zone* zone = compiler->zone();

	5621 ZoneList<CharacterRange>* word_range =

	5622 new (zone) ZoneList<CharacterRange>(2, zone);

	5623 CharacterRange::AddClassEscape('w', word_range, true, zone);

	5624 int stack_register = compiler->UnicodeLookaroundStackRegister();

	5625 int position_register = compiler->UnicodeLookaroundPositionRegister();

	5626 ChoiceNode* result = new (zone) ChoiceNode(2, zone);

	5627 // Add two choices. The (non-)boundary could start with a word or

	5628 // a non-word-character.

	5629 for (int i = 0; i < 2; i++) {

	5630 bool lookbehind_for_word = i == 0;

	5631 bool lookahead_for_word =

	5632 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;

	5633 // Look to the left.

	5634 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,

	5635 stack_register, position_register);

	5636 RegExpNode* backward = TextNode::CreateForCharacterRanges(

	5637 zone, word_range, true, lookbehind.on_match_success());

	5638 // Look to the right.

	5639 RegExpLookaround::Builder lookahead(lookahead_for_word,

	5640 lookbehind.ForMatch(backward),

	5641 stack_register, position_register);

	5642 RegExpNode* forward = TextNode::CreateForCharacterRanges(

	5643 zone, word_range, false, lookahead.on_match_success());

	5644 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));

	5645 }

	5646 return result;

	5647 }

	5648 } // anonymous namespace

5622	5649

5623 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,	5650 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

5624 RegExpNode* on_success) {	5651 RegExpNode* on_success) {

5625 NodeInfo info;	5652 NodeInfo info;

5626 Zone* zone = compiler->zone();	5653 Zone* zone = compiler->zone();

5627	5654

5628 switch (assertion_type()) {	5655 switch (assertion_type()) {

5629 case START_OF_LINE:	5656 case START_OF_LINE:

5630 return AssertionNode::AfterNewline(on_success);	5657 return AssertionNode::AfterNewline(on_success);

5631 case START_OF_INPUT:	5658 case START_OF_INPUT:

5632 return AssertionNode::AtStart(on_success);	5659 return AssertionNode::AtStart(on_success);

5633 case BOUNDARY:	5660 case BOUNDARY:

5634 return AssertionNode::AtBoundary(on_success);	5661 return compiler->unicode() && compiler->ignore_case()
	jgruber 2017/02/28 13:44:56: WDYT about adding compiler->needs_unicode_case_equivalents() (or similar) with short documentation about what's going on?
	Yang 2017/02/28 14:26:11: Done.
	5662 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)

	5663 : AssertionNode::AtBoundary(on_success);

5635 case NON_BOUNDARY:	5664 case NON_BOUNDARY:

5636 return AssertionNode::AtNonBoundary(on_success);	5665 return compiler->unicode() && compiler->ignore_case()

	5666 ? BoundaryAssertionAsLookaround(compiler, on_success,

	5667 NON_BOUNDARY)

	5668 : AssertionNode::AtNonBoundary(on_success);

5637 case END_OF_INPUT:	5669 case END_OF_INPUT:

5638 return AssertionNode::AtEnd(on_success);	5670 return AssertionNode::AtEnd(on_success);

5639 case END_OF_LINE: {	5671 case END_OF_LINE: {

5640 // Compile $ in multiline regexps as an alternation with a positive	5672 // Compile $ in multiline regexps as an alternation with a positive

5641 // lookahead in one side and an end-of-input on the other side.	5673 // lookahead in one side and an end-of-input on the other side.

5642 // We need two registers for the lookahead.	5674 // We need two registers for the lookahead.

5643 int stack_pointer_register = compiler->AllocateRegister();	5675 int stack_pointer_register = compiler->AllocateRegister();

5644 int position_register = compiler->AllocateRegister();	5676 int position_register = compiler->AllocateRegister();

5645 // The ChoiceNode to distinguish between a newline and end-of-input.	5677 // The ChoiceNode to distinguish between a newline and end-of-input.

5646 ChoiceNode* result = new(zone) ChoiceNode(2, zone);	5678 ChoiceNode* result = new(zone) ChoiceNode(2, zone);

5647 // Create a newline atom.	5679 // Create a newline atom.

5648 ZoneList<CharacterRange>* newline_ranges =	5680 ZoneList<CharacterRange>* newline_ranges =

5649 new(zone) ZoneList<CharacterRange>(3, zone);	5681 new(zone) ZoneList<CharacterRange>(3, zone);

5650 CharacterRange::AddClassEscape('n', newline_ranges, zone);	5682 CharacterRange::AddClassEscape('n', newline_ranges, false, zone);

5651 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');	5683 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');

5652 TextNode* newline_matcher = new (zone) TextNode(	5684 TextNode* newline_matcher = new (zone) TextNode(

5653 newline_atom, false, ActionNode::PositiveSubmatchSuccess(	5685 newline_atom, false, ActionNode::PositiveSubmatchSuccess(

5654 stack_pointer_register, position_register,	5686 stack_pointer_register, position_register,

5655 0, // No captures inside.	5687 0, // No captures inside.

5656 -1, // Ignored if no captures.	5688 -1, // Ignored if no captures.

5657 on_success));	5689 on_success));

5658 // Create an end-of-input matcher.	5690 // Create an end-of-input matcher.

5659 RegExpNode* end_of_line = ActionNode::BeginSubmatch(	5691 RegExpNode* end_of_line = ActionNode::BeginSubmatch(

5660 stack_pointer_register,	5692 stack_pointer_register,

(...skipping 153 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5814 uc16 last = 0x0000;	5846 uc16 last = 0x0000;

5815 for (int i = 0; i < elmc; i += 2) {	5847 for (int i = 0; i < elmc; i += 2) {

5816 DCHECK(last <= elmv[i] - 1);	5848 DCHECK(last <= elmv[i] - 1);

5817 DCHECK(elmv[i] < elmv[i + 1]);	5849 DCHECK(elmv[i] < elmv[i + 1]);

5818 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);	5850 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);

5819 last = elmv[i + 1];	5851 last = elmv[i + 1];

5820 }	5852 }

5821 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);	5853 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);

5822 }	5854 }

5823	5855

	5856 void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,

	5857 bool add_unicode_case_equivalents,

	5858 Zone* zone) {

	5859 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {

	5860 // See #sec-runtime-semantics-wordcharacters-abstract-operation

	5861 // In case of unicode and ignore_case, we need to create the closure over

	5862 // case equivalent characters before negating.

	5863 ZoneList<CharacterRange>* new_ranges =

	5864 new (zone) ZoneList<CharacterRange>(2, zone);

	5865 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);

	5866 AddUnicodeCaseEquivalents(new_ranges, zone);

	5867 if (type == 'W') {

	5868 ZoneList<CharacterRange>* negated =

	5869 new (zone) ZoneList<CharacterRange>(2, zone);

	5870 CharacterRange::Negate(new_ranges, negated, zone);

	5871 new_ranges = negated;

	5872 }

	5873 ranges->AddAll(*new_ranges, zone);

	5874 return;

	5875 }

5824	5876

5825 void CharacterRange::AddClassEscape(uc16 type,

5826 ZoneList<CharacterRange>* ranges,

5827 Zone* zone) {

5828 switch (type) {	5877 switch (type) {

5829 case 's':	5878 case 's':

5830 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);	5879 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);

5831 break;	5880 break;

5832 case 'S':	5881 case 'S':

5833 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);	5882 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);

5834 break;	5883 break;

5835 case 'w':	5884 case 'w':

5836 AddClass(kWordRanges, kWordRangeCount, ranges, zone);	5885 AddClass(kWordRanges, kWordRangeCount, ranges, zone);

5837 break;	5886 break;

(...skipping 120 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5958 if (next_range.from() <= max + 1) return false;	6007 if (next_range.from() <= max + 1) return false;

5959 max = next_range.to();	6008 max = next_range.to();

5960 }	6009 }

5961 return true;	6010 return true;

5962 }	6011 }

5963	6012

5964	6013

5965 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {	6014 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {

5966 if (ranges_ == NULL) {	6015 if (ranges_ == NULL) {

5967 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);	6016 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);

5968 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);	6017 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);

5969 }	6018 }

5970 return ranges_;	6019 return ranges_;

5971 }	6020 }

5972	6021

5973	6022

5974 // Move a number of elements in a zonelist to another position	6023 // Move a number of elements in a zonelist to another position

5975 // in the same list. Handles overlapping source and target areas.	6024 // in the same list. Handles overlapping source and target areas.

5976 static void MoveRanges(ZoneList<CharacterRange>* list,	6025 static void MoveRanges(ZoneList<CharacterRange>* list,

5977 int from,	6026 int from,

5978 int to,	6027 int to,

(...skipping 885 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6864	6913

6865	6914

6866 void RegExpResultsCache::Clear(FixedArray* cache) {	6915 void RegExpResultsCache::Clear(FixedArray* cache) {

6867 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6916 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6868 cache->set(i, Smi::kZero);	6917 cache->set(i, Smi::kZero);

6869 }	6918 }

6870 }	6919 }

6871	6920

6872 } // namespace internal	6921 } // namespace internal

6873 } // namespace v8	6922 } // namespace v8

OLD	NEW

« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | src/regexp/regexp-ast.h » ('J')