Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(344)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1608693003: [regexp] step back if starting unicode regexp within surrogate pair. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass
Patch Set: rebase Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 5014 matching lines...) Expand 10 before | Expand all | Expand 10 after
5025 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5025 int stack_register = compiler->UnicodeLookaroundStackRegister();
5026 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5026 int position_register = compiler->UnicodeLookaroundPositionRegister();
5027 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 5027 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5028 position_register); 5028 position_register);
5029 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5029 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5030 zone, lookahead, read_backward, lookaround.on_match_success()); 5030 zone, lookahead, read_backward, lookaround.on_match_success());
5031 return TextNode::CreateForCharacterRanges( 5031 return TextNode::CreateForCharacterRanges(
5032 zone, match, read_backward, lookaround.ForMatch(negative_match)); 5032 zone, match, read_backward, lookaround.ForMatch(negative_match));
5033 } 5033 }
5034 5034
5035
erikcorry 2016/01/22 10:10:10 Inadvertent edit?
Yang 2016/01/25 07:38:41 Undone.
5036 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5035 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5037 RegExpNode* on_success, 5036 RegExpNode* on_success,
5038 UnicodeRangeSplitter* splitter) { 5037 UnicodeRangeSplitter* splitter) {
5039 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates(); 5038 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5040 if (lead_surrogates == nullptr) return; 5039 if (lead_surrogates == nullptr) return;
5041 Zone* zone = compiler->zone(); 5040 Zone* zone = compiler->zone();
5042 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 5041 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5043 ZoneList<CharacterRange>* trail_surrogates = 5042 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5044 new (zone) ZoneList<CharacterRange>(1, zone); 5043 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5045 trail_surrogates->Add(
5046 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), zone);
5047 5044
5048 RegExpNode* match = 5045 RegExpNode* match =
5049 compiler->read_backward() 5046 compiler->read_backward()
5050 // Reading backward. Assert that reading forward, there is no trail 5047 // Reading backward. Assert that reading forward, there is no trail
5051 // surrogate, and then backward match the lead surrogate. 5048 // surrogate, and then backward match the lead surrogate.
5052 ? NegativeLookaroundAgainstReadDirectionAndMatch( 5049 ? NegativeLookaroundAgainstReadDirectionAndMatch(
5053 compiler, trail_surrogates, lead_surrogates, on_success, true) 5050 compiler, trail_surrogates, lead_surrogates, on_success, true)
5054 // Reading forward. Forward match the lead surrogate and assert that 5051 // Reading forward. Forward match the lead surrogate and assert that
5055 // no 5052 // no
5056 // trail surrogate follows. 5053 // trail surrogate follows.
5057 : MatchAndNegativeLookaroundInReadDirection( 5054 : MatchAndNegativeLookaroundInReadDirection(
5058 compiler, lead_surrogates, trail_surrogates, on_success, false); 5055 compiler, lead_surrogates, trail_surrogates, on_success, false);
5059 result->AddAlternative(GuardedAlternative(match)); 5056 result->AddAlternative(GuardedAlternative(match));
5060 } 5057 }
5061 5058
5062 5059
5063 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5060 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5064 RegExpNode* on_success, 5061 RegExpNode* on_success,
5065 UnicodeRangeSplitter* splitter) { 5062 UnicodeRangeSplitter* splitter) {
5066 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates(); 5063 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5067 if (trail_surrogates == nullptr) return; 5064 if (trail_surrogates == nullptr) return;
5068 Zone* zone = compiler->zone(); 5065 Zone* zone = compiler->zone();
5069 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 5066 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5070 ZoneList<CharacterRange>* lead_surrogates = 5067 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5071 new (zone) ZoneList<CharacterRange>(1, zone); 5068 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5072 lead_surrogates->Add(
5073 CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), zone);
5074 5069
5075 RegExpNode* match = 5070 RegExpNode* match =
5076 compiler->read_backward() 5071 compiler->read_backward()
5077 // Reading backward. Backward match the trail surrogate and assert 5072 // Reading backward. Backward match the trail surrogate and assert
5078 // that no lead surrogate precedes it. 5073 // that no lead surrogate precedes it.
5079 ? MatchAndNegativeLookaroundInReadDirection( 5074 ? MatchAndNegativeLookaroundInReadDirection(
5080 compiler, trail_surrogates, lead_surrogates, on_success, true) 5075 compiler, trail_surrogates, lead_surrogates, on_success, true)
5081 // Reading forward. Assert that reading backward, there is no lead 5076 // Reading forward. Assert that reading backward, there is no lead
5082 // surrogate, and then forward match the trail surrogate. 5077 // surrogate, and then forward match the trail surrogate.
5083 : NegativeLookaroundAgainstReadDirectionAndMatch( 5078 : NegativeLookaroundAgainstReadDirectionAndMatch(
5084 compiler, lead_surrogates, trail_surrogates, on_success, false); 5079 compiler, lead_surrogates, trail_surrogates, on_success, false);
5085 result->AddAlternative(GuardedAlternative(match)); 5080 result->AddAlternative(GuardedAlternative(match));
5086 } 5081 }
5087 5082
5088 5083
5084 void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result,
5085 RegExpNode* on_success) {
5086 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5087 DCHECK(!compiler->read_backward());
5088 Zone* zone = compiler->zone();
5089 // Advancing can either consume a BMP character or a trail surrogate.
5090 ZoneList<CharacterRange>* bmp_and_trail =
5091 new (zone) ZoneList<CharacterRange>(2, zone);
5092 bmp_and_trail->Add(CharacterRange::Range(0, kLeadSurrogateStart - 1), zone);
5093 bmp_and_trail->Add(
5094 CharacterRange::Range(kLeadSurrogateEnd + 1, kNonBmpStart - 1), zone);
5095 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
5096 zone, bmp_and_trail, false, on_success)));
5097
5098 // Or it could consume a lead optionally followed by a trail surrogate.
5099 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5100 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5101 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5102 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5103 ChoiceNode* optional_trail = new (zone) ChoiceNode(2, zone);
5104 optional_trail->AddAlternative(
5105 GuardedAlternative(TextNode::CreateForCharacterRanges(
5106 zone, trail_surrogates, false, on_success)));
5107 optional_trail->AddAlternative(GuardedAlternative(on_success));
5108 RegExpNode* optional_pair = TextNode::CreateForCharacterRanges(
5109 zone, lead_surrogates, false, optional_trail);
5110 result->AddAlternative(GuardedAlternative(optional_pair));
5111 }
5112
5113
5089 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5114 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5090 RegExpNode* on_success) { 5115 RegExpNode* on_success) {
5091 set_.Canonicalize(); 5116 set_.Canonicalize();
5092 Zone* zone = compiler->zone(); 5117 Zone* zone = compiler->zone();
5093 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5118 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5094 if (compiler->unicode() && !compiler->one_byte()) { 5119 if (compiler->unicode() && !compiler->one_byte()) {
5095 if (is_negated()) { 5120 if (is_negated()) {
5096 ZoneList<CharacterRange>* negated = 5121 ZoneList<CharacterRange>* negated =
5097 new (zone) ZoneList<CharacterRange>(2, zone); 5122 new (zone) ZoneList<CharacterRange>(2, zone);
5098 CharacterRange::Negate(ranges, negated, zone); 5123 CharacterRange::Negate(ranges, negated, zone);
5099 ranges = negated; 5124 ranges = negated;
5100 } 5125 }
5101 if (ranges->length() == 0) { 5126 if (ranges->length() == 0) {
5102 // No matches possible. 5127 // No matches possible.
5103 return new (zone) EndNode(EndNode::BACKTRACK, zone); 5128 return new (zone) EndNode(EndNode::BACKTRACK, zone);
5104 } 5129 }
5105 UnicodeRangeSplitter splitter(zone, ranges); 5130 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5106 ChoiceNode* result = new (compiler->zone()) ChoiceNode(2, compiler->zone()); 5131 if (standard_type() == '*') {
5107 AddBmpCharacters(compiler, result, on_success, &splitter); 5132 AddUnanchoredAdvance(compiler, result, on_success);
5108 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5133 } else {
5109 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5134 UnicodeRangeSplitter splitter(zone, ranges);
5110 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5135 AddBmpCharacters(compiler, result, on_success, &splitter);
5136 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5137 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5138 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5139 }
5111 return result; 5140 return result;
5112 } else { 5141 } else {
5113 return new (zone) TextNode(this, compiler->read_backward(), on_success); 5142 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5114 } 5143 }
5115 } 5144 }
5116 5145
5117 5146
5118 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5147 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5119 RegExpAtom* atom1 = (*a)->AsAtom(); 5148 RegExpAtom* atom1 = (*a)->AsAtom();
5120 RegExpAtom* atom2 = (*b)->AsAtom(); 5149 RegExpAtom* atom2 = (*b)->AsAtom();
(...skipping 1385 matching lines...) Expand 10 before | Expand all | Expand 10 after
6506 } 6535 }
6507 } 6536 }
6508 6537
6509 6538
6510 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6539 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6511 RegExpNode* target = that->on_success(); 6540 RegExpNode* target = that->on_success();
6512 target->Accept(this); 6541 target->Accept(this);
6513 } 6542 }
6514 6543
6515 6544
6545 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6546 RegExpNode* on_success) {
6547 // If the regexp matching starts within a surrogate pair, step back
6548 // to the lead surrogate and start matching from there.
6549 DCHECK(!compiler->read_backward());
6550 Zone* zone = compiler->zone();
6551 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6552 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6553 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6554 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6555
6556 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6557
6558 int stack_register = compiler->UnicodeLookaroundStackRegister();
6559 int position_register = compiler->UnicodeLookaroundPositionRegister();
6560 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6561 zone, lead_surrogates, true, on_success);
6562 RegExpLookaround::Builder builder(true, step_back, stack_register,
6563 position_register);
6564 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6565 zone, trail_surrogates, false, builder.on_match_success());
6566
6567 optional_step_back->AddAlternative(
6568 GuardedAlternative(builder.ForMatch(match_trail)));
6569 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6570
6571 return optional_step_back;
6572 }
6573
6574
6516 RegExpEngine::CompilationResult RegExpEngine::Compile( 6575 RegExpEngine::CompilationResult RegExpEngine::Compile(
6517 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6576 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6518 JSRegExp::Flags flags, Handle<String> pattern, 6577 JSRegExp::Flags flags, Handle<String> pattern,
6519 Handle<String> sample_subject, bool is_one_byte) { 6578 Handle<String> sample_subject, bool is_one_byte) {
6520 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6579 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6521 return IrregexpRegExpTooBig(isolate); 6580 return IrregexpRegExpTooBig(isolate);
6522 } 6581 }
6523 bool ignore_case = flags & JSRegExp::kIgnoreCase; 6582 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6524 bool is_sticky = flags & JSRegExp::kSticky; 6583 bool is_sticky = flags & JSRegExp::kSticky;
6525 bool is_global = flags & JSRegExp::kGlobal; 6584 bool is_global = flags & JSRegExp::kGlobal;
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
6568 node = loop_node; 6627 node = loop_node;
6569 } 6628 }
6570 } 6629 }
6571 if (is_one_byte) { 6630 if (is_one_byte) {
6572 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6631 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6573 // Do it again to propagate the new nodes to places where they were not 6632 // Do it again to propagate the new nodes to places where they were not
6574 // put because they had not been calculated yet. 6633 // put because they had not been calculated yet.
6575 if (node != NULL) { 6634 if (node != NULL) {
6576 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6635 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6577 } 6636 }
6637 } else if (compiler.unicode()) {
6638 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6578 } 6639 }
6579 6640
6580 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6641 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6581 data->node = node; 6642 data->node = node;
6582 Analysis analysis(isolate, ignore_case, is_one_byte); 6643 Analysis analysis(isolate, ignore_case, is_one_byte);
6583 analysis.EnsureAnalyzed(node); 6644 analysis.EnsureAnalyzed(node);
6584 if (analysis.has_failed()) { 6645 if (analysis.has_failed()) {
6585 const char* error_message = analysis.error_message(); 6646 const char* error_message = analysis.error_message();
6586 return CompilationResult(isolate, error_message); 6647 return CompilationResult(isolate, error_message);
6587 } 6648 }
(...skipping 171 matching lines...) Expand 10 before | Expand all | Expand 10 after
6759 6820
6760 6821
6761 void RegExpResultsCache::Clear(FixedArray* cache) { 6822 void RegExpResultsCache::Clear(FixedArray* cache) {
6762 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6823 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6763 cache->set(i, Smi::FromInt(0)); 6824 cache->set(i, Smi::FromInt(0));
6764 } 6825 }
6765 } 6826 }
6766 6827
6767 } // namespace internal 6828 } // namespace internal
6768 } // namespace v8 6829 } // namespace v8
OLDNEW
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698