Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index fa8900342cfc4878411a1c06d753254024f138fe..07d5779675786b0dfbec11fb7a8cf8fa19f3aecb 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -15,20 +15,18 @@ namespace v8 { |
namespace internal { |
RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
- bool multiline, bool unicode, Isolate* isolate, |
- Zone* zone) |
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
: isolate_(isolate), |
zone_(zone), |
error_(error), |
captures_(NULL), |
in_(in), |
current_(kEndMarker), |
+ flags_(flags), |
next_pos_(0), |
captures_started_(0), |
capture_count_(0), |
has_more_(true), |
- multiline_(multiline), |
- unicode_(unicode), |
simple_(false), |
contains_anchor_(false), |
is_scanned_for_captures_(false), |
@@ -37,9 +35,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
} |
+// Reads the character at next_pos_ and returns it. When the unicode flag is |
+// set and that character is a lead surrogate immediately followed by a trail |
+// surrogate, both code units are read and the combined value is returned. |
+// |
+// update_position: if true, next_pos_ is moved past everything that was read |
+// (Advance() uses this); if false, next_pos_ is left untouched, so the call |
+// only peeks (Next() uses this). |
+// |
+// The first Get() is not bounds-checked here; the callers visible in this |
+// patch test has_next() before calling. |
+template <bool update_position> |
+uc32 RegExpParser::ReadNext() { |
+ int position = next_pos_; |
+ uc32 c0 = in()->Get(position); |
+ position++; |
+ // Read the whole surrogate pair in case of unicode flag, if possible. |
+ if (unicode() && position < in()->length() && |
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
+ uc16 c1 = in()->Get(position); |
+ if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
+ c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
+ position++; |
+ } |
+ } |
+ // Only commit the new position when the caller asked to consume input. |
+ if (update_position) next_pos_ = position; |
+ return c0; |
+} |
+ |
+ |
uc32 RegExpParser::Next() { |
if (has_next()) { |
- return in()->Get(next_pos_); |
+ return ReadNext<false>(); |
} else { |
return kEndMarker; |
} |
@@ -47,25 +64,14 @@ uc32 RegExpParser::Next() { |
void RegExpParser::Advance() { |
- if (next_pos_ < in()->length()) { |
+ if (has_next()) { |
StackLimitCheck check(isolate()); |
if (check.HasOverflowed()) { |
ReportError(CStrVector(Isolate::kStackOverflowMessage)); |
} else if (zone()->excess_allocation()) { |
ReportError(CStrVector("Regular expression too large")); |
} else { |
- current_ = in()->Get(next_pos_); |
- next_pos_++; |
- // Read the whole surrogate pair in case of unicode flag, if possible. |
- if (unicode_ && next_pos_ < in()->length() && |
- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { |
- uc16 trail = in()->Get(next_pos_); |
- if (unibrow::Utf16::IsTrailSurrogate(trail)) { |
- current_ = unibrow::Utf16::CombineSurrogatePair( |
- static_cast<uc16>(current_), trail); |
- next_pos_++; |
- } |
- } |
+ current_ = ReadNext<true>(); |
} |
} else { |
current_ = kEndMarker; |
@@ -142,7 +148,7 @@ RegExpTree* RegExpParser::ParsePattern() { |
RegExpTree* RegExpParser::ParseDisjunction() { |
// Used to store current state while parsing subexpressions. |
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
- zone()); |
+ flags_, zone()); |
RegExpParserState* state = &initial_state; |
// Cache the builder in a local variable for quick access. |
RegExpBuilder* builder = initial_state.builder(); |
@@ -206,7 +212,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
return ReportError(CStrVector("Nothing to repeat")); |
case '^': { |
Advance(); |
- if (multiline_) { |
+ if (multiline()) { |
builder->AddAssertion( |
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
} else { |
@@ -219,8 +225,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
case '$': { |
Advance(); |
RegExpAssertion::AssertionType assertion_type = |
- multiline_ ? RegExpAssertion::END_OF_LINE |
- : RegExpAssertion::END_OF_INPUT; |
+ multiline() ? RegExpAssertion::END_OF_LINE |
+ : RegExpAssertion::END_OF_INPUT; |
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
continue; |
} |
@@ -230,8 +236,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
ZoneList<CharacterRange>* ranges = |
new (zone()) ZoneList<CharacterRange>(2, zone()); |
CharacterRange::AddClassEscape('.', ranges, zone()); |
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
- builder->AddAtom(atom); |
+ RegExpCharacterClass* cc = |
+ new (zone()) RegExpCharacterClass(ranges, false); |
+ builder->AddCharacterClass(cc); |
break; |
} |
case '(': { |
@@ -276,14 +283,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
captures_started_++; |
} |
// Store current state and begin new disjunction parsing. |
- state = new (zone()) RegExpParserState( |
- state, subexpr_type, lookaround_type, captures_started_, zone()); |
+ state = |
+ new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, |
+ captures_started_, flags_, zone()); |
builder = state->builder(); |
continue; |
} |
case '[': { |
- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); |
- builder->AddAtom(atom); |
+ RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); |
+ builder->AddCharacterClass(cc->AsCharacterClass()); |
break; |
} |
// Atom :: |
@@ -318,8 +326,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
ZoneList<CharacterRange>* ranges = |
new (zone()) ZoneList<CharacterRange>(2, zone()); |
CharacterRange::AddClassEscape(c, ranges, zone()); |
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
- builder->AddAtom(atom); |
+ RegExpCharacterClass* cc = |
+ new (zone()) RegExpCharacterClass(ranges, false); |
+ builder->AddCharacterClass(cc); |
break; |
} |
case '1': |
@@ -353,7 +362,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
// escaped, |
// no other identity escapes are allowed. If the 'u' flag is not |
// present, all identity escapes are allowed. |
- if (!unicode_) { |
+ if (!unicode()) { |
builder->AddCharacter(first_digit); |
Advance(2); |
} else { |
@@ -414,7 +423,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
uc32 value; |
if (ParseHexEscape(2, &value)) { |
builder->AddCharacter(value); |
- } else if (!unicode_) { |
+ } else if (!unicode()) { |
builder->AddCharacter('x'); |
} else { |
// If the 'u' flag is present, invalid escapes are not treated as |
@@ -428,7 +437,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
uc32 value; |
if (ParseUnicodeEscape(&value)) { |
builder->AddUnicodeCharacter(value); |
- } else if (!unicode_) { |
+ } else if (!unicode()) { |
builder->AddCharacter('u'); |
} else { |
// If the 'u' flag is present, invalid escapes are not treated as |
@@ -444,7 +453,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
// other identity escapes are allowed. If the 'u' flag is not |
// present, |
// all identity escapes are allowed. |
- if (!unicode_ || IsSyntaxCharacter(current())) { |
+ if (!unicode() || IsSyntaxCharacter(current())) { |
builder->AddCharacter(current()); |
Advance(); |
} else { |
@@ -745,7 +754,7 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
// allowed). In the latter case, the number of hex digits between { } is |
// arbitrary. \ and u have already been read. |
- if (current() == '{' && unicode_) { |
+ if (current() == '{' && unicode()) { |
int start = position(); |
Advance(); |
if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
@@ -840,7 +849,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
if (ParseHexEscape(2, &value)) { |
return value; |
} |
- if (!unicode_) { |
+ if (!unicode()) { |
// If \x is not followed by a two-digit hexadecimal, treat it |
// as an identity escape. |
return 'x'; |
@@ -856,7 +865,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
if (ParseUnicodeEscape(&value)) { |
return value; |
} |
- if (!unicode_) { |
+ if (!unicode()) { |
return 'u'; |
} |
// If the 'u' flag is present, invalid escapes are not treated as |
@@ -869,7 +878,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
// If the 'u' flag is present, only syntax characters can be escaped, no |
// other identity escapes are allowed. If the 'u' flag is not present, all |
// identity escapes are allowed. |
- if (!unicode_ || IsSyntaxCharacter(result)) { |
+ if (!unicode() || IsSyntaxCharacter(result)) { |
Advance(); |
return result; |
} |
@@ -899,13 +908,29 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
case kEndMarker: |
return ReportError(CStrVector("\\ at end of pattern")); |
default: |
- uc32 c = ParseClassCharacterEscape(CHECK_FAILED); |
- return CharacterRange::Singleton(c); |
+ first = ParseClassCharacterEscape(CHECK_FAILED); |
} |
} else { |
Advance(); |
- return CharacterRange::Singleton(first); |
} |
+ |
+ if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
+ // Combine with possibly following trail surrogate. |
+ int start = position(); |
+ uc32 second = current(); |
+ if (second == '\\') { |
+ second = ParseClassCharacterEscape(CHECK_FAILED); |
+ } else { |
+ Advance(); |
+ } |
+ if (unibrow::Utf16::IsTrailSurrogate(second)) { |
+ first = unibrow::Utf16::CombineSurrogatePair(first, second); |
+ } else { |
+ Reset(start); |
+ } |
+ } |
+ |
+ return CharacterRange::Singleton(first); |
} |
@@ -985,10 +1010,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() { |
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
- FlatStringReader* input, bool multiline, |
- bool unicode, RegExpCompileData* result) { |
+ FlatStringReader* input, JSRegExp::Flags flags, |
+ RegExpCompileData* result) { |
DCHECK(result != NULL); |
- RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); |
+ RegExpParser parser(input, &result->error, flags, isolate, zone); |
RegExpTree* tree = parser.ParsePattern(); |
if (parser.failed()) { |
DCHECK(tree == NULL); |
@@ -1011,10 +1036,12 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
} |
-RegExpBuilder::RegExpBuilder(Zone* zone) |
+RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) |
: zone_(zone), |
pending_empty_(false), |
+ flags_(flags), |
characters_(NULL), |
+ pending_surrogate_(kNoPendingSurrogate), |
terms_(), |
alternatives_() |
#ifdef DEBUG |
@@ -1025,7 +1052,48 @@ RegExpBuilder::RegExpBuilder(Zone* zone) |
} |
+// Records lead_surrogate as the pending surrogate instead of emitting it right |
+// away, so that a trail surrogate added next can be paired with it. Any |
+// surrogate that was already pending is flushed first. |
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { |
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
+ FlushPendingSurrogate(); |
+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
+ pending_surrogate_ = lead_surrogate; |
+} |
+ |
+ |
+// Adds a trail surrogate. If a surrogate is pending (DCHECKed to be a lead), |
+// the two code units are emitted together as one two-element RegExpAtom. |
+// Otherwise the trail surrogate is lone: it is stored as the pending surrogate |
+// and flushed immediately through FlushPendingSurrogate(). |
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
+ DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
+ if (pending_surrogate_ != kNoPendingSurrogate) { |
+ uc16 lead_surrogate = pending_surrogate_; |
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
+ ZoneList<uc16> surrogate_pair(2, zone()); |
+ surrogate_pair.Add(lead_surrogate, zone()); |
+ surrogate_pair.Add(trail_surrogate, zone()); |
+ RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
+ // The pending state is cleared before AddAtom() is called. |
+ // NOTE(review): presumably so that a flush triggered inside AddAtom() cannot |
+ // emit the lead surrogate a second time on its own; AddAtom()'s body is only |
+ // partly visible in this patch - confirm. |
+ pending_surrogate_ = kNoPendingSurrogate; |
+ AddAtom(atom); |
+ } else { |
+ pending_surrogate_ = trail_surrogate; |
+ FlushPendingSurrogate(); |
+ } |
+} |
+ |
+ |
+// If a surrogate is pending, emits it as a character class that contains just |
+// that one code unit and clears the pending state. Does nothing otherwise. |
+void RegExpBuilder::FlushPendingSurrogate() { |
+ if (pending_surrogate_ != kNoPendingSurrogate) { |
+ // Use character class to desugar lone surrogate matching. |
+ RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( |
+ CharacterRange::List(zone(), |
+ CharacterRange::Singleton(pending_surrogate_)), |
+ false); |
+ // Reset before calling AddCharacterClass(). FlushCharacters() calls back |
+ // into this function. |
+ // NOTE(review): resetting first presumably keeps the surrogate from being |
+ // emitted twice if AddCharacterClass() ends up flushing - confirm. |
+ pending_surrogate_ = kNoPendingSurrogate; |
+ DCHECK(unicode()); |
+ AddCharacterClass(cc); |
+ } |
+} |
+ |
+ |
void RegExpBuilder::FlushCharacters() { |
+ FlushPendingSurrogate(); |
pending_empty_ = false; |
if (characters_ != NULL) { |
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
@@ -1053,6 +1121,7 @@ void RegExpBuilder::FlushText() { |
void RegExpBuilder::AddCharacter(uc16 c) { |
+ FlushPendingSurrogate(); |
pending_empty_ = false; |
if (characters_ == NULL) { |
characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
@@ -1064,11 +1133,13 @@ void RegExpBuilder::AddCharacter(uc16 c) { |
void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
- ZoneList<uc16> surrogate_pair(2, zone()); |
- surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); |
- surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); |
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
- AddAtom(atom); |
+ DCHECK(unicode()); |
+ AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
+ AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
+ } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
+ AddLeadSurrogate(c); |
+ } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
+ AddTrailSurrogate(c); |
} else { |
AddCharacter(static_cast<uc16>(c)); |
} |
@@ -1078,6 +1149,17 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
+// Sets pending_empty_. The flag is cleared again by FlushCharacters(), |
+// AddCharacter() and AddQuantifierToAtom(). |
void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
+// Adds a character class to the expression being built. In unicode mode, a |
+// class for which NeedsDesugaringForUnicode() returns true is added through |
+// AddTerm(); every other class goes through AddAtom(). |
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
+ if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { |
+ // In unicode mode, character class needs to be desugared, so it |
+ // must be a standalone term instead of being part of a RegExpText. |
+ AddTerm(cc); |
+ } else { |
+ AddAtom(cc); |
+ } |
+} |
+ |
+ |
void RegExpBuilder::AddAtom(RegExpTree* term) { |
if (term->IsEmpty()) { |
AddEmpty(); |
@@ -1094,6 +1176,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) { |
} |
+// Appends term to terms_ as a term of its own, after flushing with |
+// FlushText(). |
+void RegExpBuilder::AddTerm(RegExpTree* term) { |
+ FlushText(); |
+ terms_.Add(term, zone()); |
+ // NOTE(review): LAST is defined outside this patch; presumably it records |
+ // the kind of the most recent addition - confirm. |
+ LAST(ADD_ATOM); |
+} |
+ |
+ |
void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
FlushText(); |
terms_.Add(assert, zone()); |
@@ -1132,6 +1221,7 @@ RegExpTree* RegExpBuilder::ToRegExp() { |
void RegExpBuilder::AddQuantifierToAtom( |
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
+ FlushPendingSurrogate(); |
if (pending_empty_) { |
pending_empty_ = false; |
return; |