Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(147)

Unified Diff: src/regexp/regexp-parser.cc

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: more tests Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index fa8900342cfc4878411a1c06d753254024f138fe..07d5779675786b0dfbec11fb7a8cf8fa19f3aecb 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -15,20 +15,18 @@ namespace v8 {
namespace internal {
RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
- bool multiline, bool unicode, Isolate* isolate,
- Zone* zone)
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
: isolate_(isolate),
zone_(zone),
error_(error),
captures_(NULL),
in_(in),
current_(kEndMarker),
+ flags_(flags),
next_pos_(0),
captures_started_(0),
capture_count_(0),
has_more_(true),
- multiline_(multiline),
- unicode_(unicode),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
@@ -37,9 +35,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
}
+template <bool update_position>  // true: store the new offset in next_pos_; false: leave next_pos_ untouched (peek)
+uc32 RegExpParser::ReadNext() {  // Returns the character at next_pos_; does no bounds check on next_pos_ itself.
+ int position = next_pos_;  // work on a local copy so the peek variant has no side effect
+ uc32 c0 = in()->Get(position);
+ position++;
+ // Read the whole surrogate pair in case of unicode flag, if possible.
+ if (unicode() && position < in()->length() &&
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
+ uc16 c1 = in()->Get(position);
+ if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+ c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
+ position++;  // the trail unit is consumed together with the lead
+ }  // otherwise c0 stays the lone lead surrogate and only one unit is consumed
+ }
+ if (update_position) next_pos_ = position;
+ return c0;
+}
+
+
uc32 RegExpParser::Next() {
if (has_next()) {
- return in()->Get(next_pos_);
+ return ReadNext<false>();
} else {
return kEndMarker;
}
@@ -47,25 +64,14 @@ uc32 RegExpParser::Next() {
void RegExpParser::Advance() {
- if (next_pos_ < in()->length()) {
+ if (has_next()) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
ReportError(CStrVector(Isolate::kStackOverflowMessage));
} else if (zone()->excess_allocation()) {
ReportError(CStrVector("Regular expression too large"));
} else {
- current_ = in()->Get(next_pos_);
- next_pos_++;
- // Read the whole surrogate pair in case of unicode flag, if possible.
- if (unicode_ && next_pos_ < in()->length() &&
- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
- uc16 trail = in()->Get(next_pos_);
- if (unibrow::Utf16::IsTrailSurrogate(trail)) {
- current_ = unibrow::Utf16::CombineSurrogatePair(
- static_cast<uc16>(current_), trail);
- next_pos_++;
- }
- }
+ current_ = ReadNext<true>();
}
} else {
current_ = kEndMarker;
@@ -142,7 +148,7 @@ RegExpTree* RegExpParser::ParsePattern() {
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
- zone());
+ flags_, zone());
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@@ -206,7 +212,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("Nothing to repeat"));
case '^': {
Advance();
- if (multiline_) {
+ if (multiline()) {
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
} else {
@@ -219,8 +225,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '$': {
Advance();
RegExpAssertion::AssertionType assertion_type =
- multiline_ ? RegExpAssertion::END_OF_LINE
- : RegExpAssertion::END_OF_INPUT;
+ multiline() ? RegExpAssertion::END_OF_LINE
+ : RegExpAssertion::END_OF_INPUT;
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
continue;
}
@@ -230,8 +236,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape('.', ranges, zone());
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddAtom(atom);
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddCharacterClass(cc);
break;
}
case '(': {
@@ -276,14 +283,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
captures_started_++;
}
// Store current state and begin new disjunction parsing.
- state = new (zone()) RegExpParserState(
- state, subexpr_type, lookaround_type, captures_started_, zone());
+ state =
+ new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,
+ captures_started_, flags_, zone());
builder = state->builder();
continue;
}
case '[': {
- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
- builder->AddAtom(atom);
+ RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
+ builder->AddCharacterClass(cc->AsCharacterClass());
break;
}
// Atom ::
@@ -318,8 +326,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges, zone());
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddAtom(atom);
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddCharacterClass(cc);
break;
}
case '1':
@@ -353,7 +362,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// escaped,
// no other identity escapes are allowed. If the 'u' flag is not
// present, all identity escapes are allowed.
- if (!unicode_) {
+ if (!unicode()) {
builder->AddCharacter(first_digit);
Advance(2);
} else {
@@ -414,7 +423,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
uc32 value;
if (ParseHexEscape(2, &value)) {
builder->AddCharacter(value);
- } else if (!unicode_) {
+ } else if (!unicode()) {
builder->AddCharacter('x');
} else {
// If the 'u' flag is present, invalid escapes are not treated as
@@ -428,7 +437,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
uc32 value;
if (ParseUnicodeEscape(&value)) {
builder->AddUnicodeCharacter(value);
- } else if (!unicode_) {
+ } else if (!unicode()) {
builder->AddCharacter('u');
} else {
// If the 'u' flag is present, invalid escapes are not treated as
@@ -444,7 +453,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// other identity escapes are allowed. If the 'u' flag is not
// present,
// all identity escapes are allowed.
- if (!unicode_ || IsSyntaxCharacter(current())) {
+ if (!unicode() || IsSyntaxCharacter(current())) {
builder->AddCharacter(current());
Advance();
} else {
@@ -745,7 +754,7 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
// arbitrary. \ and u have already been read.
- if (current() == '{' && unicode_) {
+ if (current() == '{' && unicode()) {
int start = position();
Advance();
if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
@@ -840,7 +849,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseHexEscape(2, &value)) {
return value;
}
- if (!unicode_) {
+ if (!unicode()) {
// If \x is not followed by a two-digit hexadecimal, treat it
// as an identity escape.
return 'x';
@@ -856,7 +865,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseUnicodeEscape(&value)) {
return value;
}
- if (!unicode_) {
+ if (!unicode()) {
return 'u';
}
// If the 'u' flag is present, invalid escapes are not treated as
@@ -869,7 +878,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
// If the 'u' flag is present, only syntax characters can be escaped, no
// other identity escapes are allowed. If the 'u' flag is not present, all
// identity escapes are allowed.
- if (!unicode_ || IsSyntaxCharacter(result)) {
+ if (!unicode() || IsSyntaxCharacter(result)) {
Advance();
return result;
}
@@ -899,13 +908,29 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
case kEndMarker:
return ReportError(CStrVector("\\ at end of pattern"));
default:
- uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
- return CharacterRange::Singleton(c);
+ first = ParseClassCharacterEscape(CHECK_FAILED);
}
} else {
Advance();
- return CharacterRange::Singleton(first);
}
+
+ if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
+ // Combine with possibly following trail surrogate.
+ int start = position();
+ uc32 second = current();
+ if (second == '\\') {
+ second = ParseClassCharacterEscape(CHECK_FAILED);
+ } else {
+ Advance();
+ }
+ if (unibrow::Utf16::IsTrailSurrogate(second)) {
+ first = unibrow::Utf16::CombineSurrogatePair(first, second);
+ } else {
+ Reset(start);
+ }
+ }
+
+ return CharacterRange::Singleton(first);
}
@@ -985,10 +1010,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
- FlatStringReader* input, bool multiline,
- bool unicode, RegExpCompileData* result) {
+ FlatStringReader* input, JSRegExp::Flags flags,
+ RegExpCompileData* result) {
DCHECK(result != NULL);
- RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
+ RegExpParser parser(input, &result->error, flags, isolate, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == NULL);
@@ -1011,10 +1036,12 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
}
-RegExpBuilder::RegExpBuilder(Zone* zone)
+RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
: zone_(zone),
pending_empty_(false),
+ flags_(flags),
characters_(NULL),
+ pending_surrogate_(kNoPendingSurrogate),
terms_(),
alternatives_()
#ifdef DEBUG
@@ -1025,7 +1052,48 @@ RegExpBuilder::RegExpBuilder(Zone* zone)
}
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {  // Buffers a lead surrogate instead of emitting it right away.
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ FlushPendingSurrogate();  // a surrogate already pending is emitted on its own before being replaced
+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+ pending_surrogate_ = lead_surrogate;
+}
+
+
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {  // Pairs the trail with a pending surrogate, or emits it alone.
+ DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ uc16 lead_surrogate = pending_surrogate_;
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ ZoneList<uc16> surrogate_pair(2, zone());  // two code units: lead followed by trail
+ surrogate_pair.Add(lead_surrogate, zone());
+ surrogate_pair.Add(trail_surrogate, zone());
+ RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+ pending_surrogate_ = kNoPendingSurrogate;  // cleared before AddAtom; NOTE(review): presumably so a flush inside AddAtom does not re-emit the lead - confirm
+ AddAtom(atom);
+ } else {
+ pending_surrogate_ = trail_surrogate;  // no lead is pending: stage the lone trail so the flush below emits it
+ FlushPendingSurrogate();
+ }
+}
+
+
+void RegExpBuilder::FlushPendingSurrogate() {  // Emits the buffered surrogate, if any, as a one-element character class.
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ // Use character class to desugar lone surrogate matching.
+ RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
+ CharacterRange::List(zone(),
+ CharacterRange::Singleton(pending_surrogate_)),
+ false);
+ pending_surrogate_ = kNoPendingSurrogate;  // cleared before AddCharacterClass; NOTE(review): presumably guards against re-entry through the flush paths - confirm
+ DCHECK(unicode());
+ AddCharacterClass(cc);
+ }
+}
+
+
void RegExpBuilder::FlushCharacters() {
+ FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ != NULL) {
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
@@ -1053,6 +1121,7 @@ void RegExpBuilder::FlushText() {
void RegExpBuilder::AddCharacter(uc16 c) {
+ FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ == NULL) {
characters_ = new (zone()) ZoneList<uc16>(4, zone());
@@ -1064,11 +1133,13 @@ void RegExpBuilder::AddCharacter(uc16 c) {
void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
- ZoneList<uc16> surrogate_pair(2, zone());
- surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
- surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
- AddAtom(atom);
+ DCHECK(unicode());
+ AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+ AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+ } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+ AddLeadSurrogate(c);
+ } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+ AddTrailSurrogate(c);
} else {
AddCharacter(static_cast<uc16>(c));
}
@@ -1078,6 +1149,17 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {  // Routes cc to AddTerm or AddAtom depending on desugaring needs.
+ if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {
+ // In unicode mode, character class needs to be desugared, so it
+ // must be a standalone term instead of being part of a RegExpText.
+ AddTerm(cc);
+ } else {
+ AddAtom(cc);  // non-unicode mode, or no desugaring needed: handled like any other atom
+ }
+}
+
+
void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) {
AddEmpty();
@@ -1094,6 +1176,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) {
}
+void RegExpBuilder::AddTerm(RegExpTree* term) {  // Appends term to terms_ as its own entry.
+ FlushText();  // runs before the append, so the term is added after FlushText's work
+ terms_.Add(term, zone());
+ LAST(ADD_ATOM);  // NOTE(review): LAST is defined outside this view; the constructor's #ifdef DEBUG section suggests debug-only bookkeeping - confirm
+}
+
+
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
FlushText();
terms_.Add(assert, zone());
@@ -1132,6 +1221,7 @@ RegExpTree* RegExpBuilder::ToRegExp() {
void RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+ FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
return;
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698