Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(289)

Unified Diff: src/regexp/jsregexp.cc

Issue 2725583002: [regexp] fix /\W/ui wrt \u017f and \u212a. (Closed)
Patch Set: address comments Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/regexp/jsregexp.cc
diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc
index fbbd3ecda29efcda0e3e72193d9bb39a2dd2e3a3..4e33e4ac7807849d1b137675bb400d88acf9626b 100644
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@@ -1019,6 +1019,11 @@ class RegExpCompiler {
inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
+ // True iff both the unicode and ignore_case flags are set, in which case
+ // the closure over case equivalents has to be computed using ICU.
+ inline bool needs_unicode_case_equivalents() {
+ return unicode() && ignore_case();
+ }
inline bool one_byte() { return one_byte_; }
inline bool optimize() { return optimize_; }
inline void set_optimize(bool value) { optimize_ = value; }
@@ -5108,13 +5113,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
}
-
-void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
- ZoneList<CharacterRange>* ranges) {
+void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
#ifdef V8_I18N_SUPPORT
// Use ICU to compute the case fold closure over the ranges.
- DCHECK(compiler->unicode());
- DCHECK(compiler->ignore_case());
icu::UnicodeSet set;
for (int i = 0; i < ranges->length(); i++) {
set.add(ranges->at(i).from(), ranges->at(i).to());
@@ -5125,18 +5126,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
// Those are represented as strings in the set. Remove them so that
// we end up with only simple and common case mappings.
set.removeAllStrings();
- Zone* zone = compiler->zone();
for (int i = 0; i < set.getRangeCount(); i++) {
ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
zone);
}
// No errors and everything we collected have been ranges.
-#else
- // Fallback if ICU is not included.
- CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
- ranges, compiler->one_byte());
-#endif // V8_I18N_SUPPORT
CharacterRange::Canonicalize(ranges);
+#endif // V8_I18N_SUPPORT
}
@@ -5145,8 +5141,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
set_.Canonicalize();
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone);
- if (compiler->unicode() && compiler->ignore_case()) {
- AddUnicodeCaseEquivalents(compiler, ranges);
+ if (compiler->needs_unicode_case_equivalents()) {
+ AddUnicodeCaseEquivalents(ranges, zone);
}
if (compiler->unicode() && !compiler->one_byte()) {
if (is_negated()) {
@@ -5619,6 +5615,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
}
}
+namespace {
+// Desugars \b into (?<=\w)(?=\W)|(?<=\W)(?=\w) and
+// \B into (?<=\w)(?=\w)|(?<=\W)(?=\W), where \w includes unicode case equivalents.
+RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpAssertion::AssertionType type) {
+ DCHECK(compiler->needs_unicode_case_equivalents());
+ Zone* zone = compiler->zone();
+ ZoneList<CharacterRange>* word_range =
+ new (zone) ZoneList<CharacterRange>(2, zone);
+ CharacterRange::AddClassEscape('w', word_range, true, zone);  // true: add unicode case equivalents.
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ ChoiceNode* result = new (zone) ChoiceNode(2, zone);
+ // Add two alternatives: i == 0 expects a word character to the left, i == 1
+ // a non-word character; `type` decides what is then expected to the right.
+ for (int i = 0; i < 2; i++) {
+ bool lookbehind_for_word = i == 0;
+ bool lookahead_for_word =
+ (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
+ // Look to the left. NOTE(review): the bool presumably selects a positive vs. negative lookaround - confirm.
+ RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
+ stack_register, position_register);
+ RegExpNode* backward = TextNode::CreateForCharacterRanges(
+ zone, word_range, true, lookbehind.on_match_success());
+ // Look to the right; this lookahead continues into the lookbehind built above.
+ RegExpLookaround::Builder lookahead(lookahead_for_word,
+ lookbehind.ForMatch(backward),
+ stack_register, position_register);
+ RegExpNode* forward = TextNode::CreateForCharacterRanges(
+ zone, word_range, false, lookahead.on_match_success());
+ result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
+ }
+ return result;
+}
+} // anonymous namespace
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
@@ -5631,9 +5663,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
case START_OF_INPUT:
return AssertionNode::AtStart(on_success);
case BOUNDARY:
- return AssertionNode::AtBoundary(on_success);
+ return compiler->needs_unicode_case_equivalents()
+ ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
+ : AssertionNode::AtBoundary(on_success);
case NON_BOUNDARY:
- return AssertionNode::AtNonBoundary(on_success);
+ return compiler->needs_unicode_case_equivalents()
+ ? BoundaryAssertionAsLookaround(compiler, on_success,
+ NON_BOUNDARY)
+ : AssertionNode::AtNonBoundary(on_success);
case END_OF_INPUT:
return AssertionNode::AtEnd(on_success);
case END_OF_LINE: {
@@ -5647,7 +5684,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
// Create a newline atom.
ZoneList<CharacterRange>* newline_ranges =
new(zone) ZoneList<CharacterRange>(3, zone);
- CharacterRange::AddClassEscape('n', newline_ranges, zone);
+ CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
TextNode* newline_matcher = new (zone) TextNode(
newline_atom, false, ActionNode::PositiveSubmatchSuccess(
@@ -5821,9 +5858,30 @@ static void AddClassNegated(const int *elmv,
ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
}
+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
+ bool add_unicode_case_equivalents,
+ Zone* zone) {
+ if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
+ // See #sec-runtime-semantics-wordcharacters-abstract-operation
+ // With unicode and ignore_case, the closure over case-equivalent characters
+ // must be computed before negating, so \W is the complement of the closed \w set.
+ ZoneList<CharacterRange>* new_ranges =
+ new (zone) ZoneList<CharacterRange>(2, zone);
+ AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
+ AddUnicodeCaseEquivalents(new_ranges, zone);
+ if (type == 'W') {
+ ZoneList<CharacterRange>* negated =
+ new (zone) ZoneList<CharacterRange>(2, zone);
+ CharacterRange::Negate(new_ranges, negated, zone);
+ new_ranges = negated;
+ }
+ ranges->AddAll(*new_ranges, zone);
+ return;
+ }
+ AddClassEscape(type, ranges, zone);  // All other cases: defer to the three-argument overload.
+}
-void CharacterRange::AddClassEscape(uc16 type,
- ZoneList<CharacterRange>* ranges,
+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
Zone* zone) {
switch (type) {
case 's':
@@ -5965,7 +6023,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
if (ranges_ == NULL) {
ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
- CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
+ CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);  // false: no unicode case equivalents added here.
}
return ranges_;
}
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698