Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(101)

Unified Diff: src/regexp/regexp-parser.cc

Issue 1618753002: Revert of [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments | Hide Comments ('s')
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..fa8900342cfc4878411a1c06d753254024f138fe 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -15,18 +15,20 @@
namespace internal {
RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
- JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
+ bool multiline, bool unicode, Isolate* isolate,
+ Zone* zone)
: isolate_(isolate),
zone_(zone),
error_(error),
captures_(NULL),
in_(in),
current_(kEndMarker),
- flags_(flags),
next_pos_(0),
captures_started_(0),
capture_count_(0),
has_more_(true),
+ multiline_(multiline),
+ unicode_(unicode),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
@@ -35,28 +37,9 @@
}
-template <bool update_position>
-uc32 RegExpParser::ReadNext() {
- int position = next_pos_;
- uc32 c0 = in()->Get(position);
- position++;
- // Read the whole surrogate pair in case of unicode flag, if possible.
- if (unicode() && position < in()->length() &&
- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
- uc16 c1 = in()->Get(position);
- if (unibrow::Utf16::IsTrailSurrogate(c1)) {
- c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
- position++;
- }
- }
- if (update_position) next_pos_ = position;
- return c0;
-}
-
-
uc32 RegExpParser::Next() {
if (has_next()) {
- return ReadNext<false>();
+ return in()->Get(next_pos_);
} else {
return kEndMarker;
}
@@ -64,14 +47,25 @@
void RegExpParser::Advance() {
- if (has_next()) {
+ if (next_pos_ < in()->length()) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
ReportError(CStrVector(Isolate::kStackOverflowMessage));
} else if (zone()->excess_allocation()) {
ReportError(CStrVector("Regular expression too large"));
} else {
- current_ = ReadNext<true>();
+ current_ = in()->Get(next_pos_);
+ next_pos_++;
+ // Read the whole surrogate pair in case of unicode flag, if possible.
+ if (unicode_ && next_pos_ < in()->length() &&
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
+ uc16 trail = in()->Get(next_pos_);
+ if (unibrow::Utf16::IsTrailSurrogate(trail)) {
+ current_ = unibrow::Utf16::CombineSurrogatePair(
+ static_cast<uc16>(current_), trail);
+ next_pos_++;
+ }
+ }
}
} else {
current_ = kEndMarker;
@@ -148,7 +142,7 @@
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
- flags_, zone());
+ zone());
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@@ -212,7 +206,7 @@
return ReportError(CStrVector("Nothing to repeat"));
case '^': {
Advance();
- if (multiline()) {
+ if (multiline_) {
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
} else {
@@ -225,8 +219,8 @@
case '$': {
Advance();
RegExpAssertion::AssertionType assertion_type =
- multiline() ? RegExpAssertion::END_OF_LINE
- : RegExpAssertion::END_OF_INPUT;
+ multiline_ ? RegExpAssertion::END_OF_LINE
+ : RegExpAssertion::END_OF_INPUT;
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
continue;
}
@@ -236,9 +230,8 @@
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape('.', ranges, zone());
- RegExpCharacterClass* cc =
- new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddCharacterClass(cc);
+ RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddAtom(atom);
break;
}
case '(': {
@@ -283,15 +276,14 @@
captures_started_++;
}
// Store current state and begin new disjunction parsing.
- state =
- new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,
- captures_started_, flags_, zone());
+ state = new (zone()) RegExpParserState(
+ state, subexpr_type, lookaround_type, captures_started_, zone());
builder = state->builder();
continue;
}
case '[': {
- RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
- builder->AddCharacterClass(cc->AsCharacterClass());
+ RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
+ builder->AddAtom(atom);
break;
}
// Atom ::
@@ -326,9 +318,8 @@
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges, zone());
- RegExpCharacterClass* cc =
- new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddCharacterClass(cc);
+ RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddAtom(atom);
break;
}
case '1':
@@ -362,7 +353,7 @@
// escaped,
// no other identity escapes are allowed. If the 'u' flag is not
// present, all identity escapes are allowed.
- if (!unicode()) {
+ if (!unicode_) {
builder->AddCharacter(first_digit);
Advance(2);
} else {
@@ -423,7 +414,7 @@
uc32 value;
if (ParseHexEscape(2, &value)) {
builder->AddCharacter(value);
- } else if (!unicode()) {
+ } else if (!unicode_) {
builder->AddCharacter('x');
} else {
// If the 'u' flag is present, invalid escapes are not treated as
@@ -437,7 +428,7 @@
uc32 value;
if (ParseUnicodeEscape(&value)) {
builder->AddUnicodeCharacter(value);
- } else if (!unicode()) {
+ } else if (!unicode_) {
builder->AddCharacter('u');
} else {
// If the 'u' flag is present, invalid escapes are not treated as
@@ -453,7 +444,7 @@
// other identity escapes are allowed. If the 'u' flag is not
// present,
// all identity escapes are allowed.
- if (!unicode() || IsSyntaxCharacter(current())) {
+ if (!unicode_ || IsSyntaxCharacter(current())) {
builder->AddCharacter(current());
Advance();
} else {
@@ -754,7 +745,7 @@
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
// arbitrary. \ and u have already been read.
- if (current() == '{' && unicode()) {
+ if (current() == '{' && unicode_) {
int start = position();
Advance();
if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
@@ -849,7 +840,7 @@
if (ParseHexEscape(2, &value)) {
return value;
}
- if (!unicode()) {
+ if (!unicode_) {
// If \x is not followed by a two-digit hexadecimal, treat it
// as an identity escape.
return 'x';
@@ -865,7 +856,7 @@
if (ParseUnicodeEscape(&value)) {
return value;
}
- if (!unicode()) {
+ if (!unicode_) {
return 'u';
}
// If the 'u' flag is present, invalid escapes are not treated as
@@ -878,7 +869,7 @@
// If the 'u' flag is present, only syntax characters can be escaped, no
// other identity escapes are allowed. If the 'u' flag is not present, all
// identity escapes are allowed.
- if (!unicode() || IsSyntaxCharacter(result)) {
+ if (!unicode_ || IsSyntaxCharacter(result)) {
Advance();
return result;
}
@@ -908,29 +899,13 @@
case kEndMarker:
return ReportError(CStrVector("\\ at end of pattern"));
default:
- first = ParseClassCharacterEscape(CHECK_FAILED);
+ uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
+ return CharacterRange::Singleton(c);
}
} else {
Advance();
- }
-
- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
- // Combine with possibly following trail surrogate.
- int start = position();
- uc32 second = current();
- if (second == '\\') {
- second = ParseClassCharacterEscape(CHECK_FAILED);
- } else {
- Advance();
- }
- if (unibrow::Utf16::IsTrailSurrogate(second)) {
- first = unibrow::Utf16::CombineSurrogatePair(first, second);
- } else {
- Reset(start);
- }
- }
-
- return CharacterRange::Singleton(first);
+ return CharacterRange::Singleton(first);
+ }
}
@@ -1010,10 +985,10 @@
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
- FlatStringReader* input, JSRegExp::Flags flags,
- RegExpCompileData* result) {
+ FlatStringReader* input, bool multiline,
+ bool unicode, RegExpCompileData* result) {
DCHECK(result != NULL);
- RegExpParser parser(input, &result->error, flags, isolate, zone);
+ RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == NULL);
@@ -1036,12 +1011,10 @@
}
-RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
+RegExpBuilder::RegExpBuilder(Zone* zone)
: zone_(zone),
pending_empty_(false),
- flags_(flags),
characters_(NULL),
- pending_surrogate_(kNoPendingSurrogate),
terms_(),
alternatives_()
#ifdef DEBUG
@@ -1052,48 +1025,7 @@
}
-void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
- FlushPendingSurrogate();
- // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
- pending_surrogate_ = lead_surrogate;
-}
-
-
-void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
- DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
- if (pending_surrogate_ != kNoPendingSurrogate) {
- uc16 lead_surrogate = pending_surrogate_;
- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
- ZoneList<uc16> surrogate_pair(2, zone());
- surrogate_pair.Add(lead_surrogate, zone());
- surrogate_pair.Add(trail_surrogate, zone());
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
- pending_surrogate_ = kNoPendingSurrogate;
- AddAtom(atom);
- } else {
- pending_surrogate_ = trail_surrogate;
- FlushPendingSurrogate();
- }
-}
-
-
-void RegExpBuilder::FlushPendingSurrogate() {
- if (pending_surrogate_ != kNoPendingSurrogate) {
- // Use character class to desugar lone surrogate matching.
- RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
- CharacterRange::List(zone(),
- CharacterRange::Singleton(pending_surrogate_)),
- false);
- pending_surrogate_ = kNoPendingSurrogate;
- DCHECK(unicode());
- AddCharacterClass(cc);
- }
-}
-
-
void RegExpBuilder::FlushCharacters() {
- FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ != NULL) {
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
@@ -1121,7 +1053,6 @@
void RegExpBuilder::AddCharacter(uc16 c) {
- FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ == NULL) {
characters_ = new (zone()) ZoneList<uc16>(4, zone());
@@ -1133,13 +1064,11 @@
void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
- DCHECK(unicode());
- AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
- AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
- } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
- AddLeadSurrogate(c);
- } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
- AddTrailSurrogate(c);
+ ZoneList<uc16> surrogate_pair(2, zone());
+ surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
+ surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
+ RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+ AddAtom(atom);
} else {
AddCharacter(static_cast<uc16>(c));
}
@@ -1147,17 +1076,6 @@
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
-
-
-void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
- if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {
- // In unicode mode, character class needs to be desugared, so it
- // must be a standalone term instead of being part of a RegExpText.
- AddTerm(cc);
- } else {
- AddAtom(cc);
- }
-}
void RegExpBuilder::AddAtom(RegExpTree* term) {
@@ -1172,13 +1090,6 @@
FlushText();
terms_.Add(term, zone());
}
- LAST(ADD_ATOM);
-}
-
-
-void RegExpBuilder::AddTerm(RegExpTree* term) {
- FlushText();
- terms_.Add(term, zone());
LAST(ADD_ATOM);
}
@@ -1221,7 +1132,6 @@
void RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
- FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
return;
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698