Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..fa8900342cfc4878411a1c06d753254024f138fe 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -15,18 +15,20 @@ |
namespace internal { |
RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
- JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
+ bool multiline, bool unicode, Isolate* isolate, |
+ Zone* zone) |
: isolate_(isolate), |
zone_(zone), |
error_(error), |
captures_(NULL), |
in_(in), |
current_(kEndMarker), |
- flags_(flags), |
next_pos_(0), |
captures_started_(0), |
capture_count_(0), |
has_more_(true), |
+ multiline_(multiline), |
+ unicode_(unicode), |
simple_(false), |
contains_anchor_(false), |
is_scanned_for_captures_(false), |
@@ -35,28 +37,9 @@ |
} |
-template <bool update_position> |
-uc32 RegExpParser::ReadNext() { |
- int position = next_pos_; |
- uc32 c0 = in()->Get(position); |
- position++; |
- // Read the whole surrogate pair in case of unicode flag, if possible. |
- if (unicode() && position < in()->length() && |
- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
- uc16 c1 = in()->Get(position); |
- if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
- c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
- position++; |
- } |
- } |
- if (update_position) next_pos_ = position; |
- return c0; |
-} |
- |
- |
uc32 RegExpParser::Next() { |
if (has_next()) { |
- return ReadNext<false>(); |
+ return in()->Get(next_pos_); |
} else { |
return kEndMarker; |
} |
@@ -64,14 +47,25 @@ |
void RegExpParser::Advance() { |
- if (has_next()) { |
+ if (next_pos_ < in()->length()) { |
StackLimitCheck check(isolate()); |
if (check.HasOverflowed()) { |
ReportError(CStrVector(Isolate::kStackOverflowMessage)); |
} else if (zone()->excess_allocation()) { |
ReportError(CStrVector("Regular expression too large")); |
} else { |
- current_ = ReadNext<true>(); |
+ current_ = in()->Get(next_pos_); |
+ next_pos_++; |
+ // Read the whole surrogate pair in case of unicode flag, if possible. |
+ if (unicode_ && next_pos_ < in()->length() && |
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { |
+ uc16 trail = in()->Get(next_pos_); |
+ if (unibrow::Utf16::IsTrailSurrogate(trail)) { |
+ current_ = unibrow::Utf16::CombineSurrogatePair( |
+ static_cast<uc16>(current_), trail); |
+ next_pos_++; |
+ } |
+ } |
} |
} else { |
current_ = kEndMarker; |
@@ -148,7 +142,7 @@ |
RegExpTree* RegExpParser::ParseDisjunction() { |
// Used to store current state while parsing subexpressions. |
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
- flags_, zone()); |
+ zone()); |
RegExpParserState* state = &initial_state; |
// Cache the builder in a local variable for quick access. |
RegExpBuilder* builder = initial_state.builder(); |
@@ -212,7 +206,7 @@ |
return ReportError(CStrVector("Nothing to repeat")); |
case '^': { |
Advance(); |
- if (multiline()) { |
+ if (multiline_) { |
builder->AddAssertion( |
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
} else { |
@@ -225,8 +219,8 @@ |
case '$': { |
Advance(); |
RegExpAssertion::AssertionType assertion_type = |
- multiline() ? RegExpAssertion::END_OF_LINE |
- : RegExpAssertion::END_OF_INPUT; |
+ multiline_ ? RegExpAssertion::END_OF_LINE |
+ : RegExpAssertion::END_OF_INPUT; |
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
continue; |
} |
@@ -236,9 +230,8 @@ |
ZoneList<CharacterRange>* ranges = |
new (zone()) ZoneList<CharacterRange>(2, zone()); |
CharacterRange::AddClassEscape('.', ranges, zone()); |
- RegExpCharacterClass* cc = |
- new (zone()) RegExpCharacterClass(ranges, false); |
- builder->AddCharacterClass(cc); |
+ RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
+ builder->AddAtom(atom); |
break; |
} |
case '(': { |
@@ -283,15 +276,14 @@ |
captures_started_++; |
} |
// Store current state and begin new disjunction parsing. |
- state = |
- new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, |
- captures_started_, flags_, zone()); |
+ state = new (zone()) RegExpParserState( |
+ state, subexpr_type, lookaround_type, captures_started_, zone()); |
builder = state->builder(); |
continue; |
} |
case '[': { |
- RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); |
- builder->AddCharacterClass(cc->AsCharacterClass()); |
+ RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); |
+ builder->AddAtom(atom); |
break; |
} |
// Atom :: |
@@ -326,9 +318,8 @@ |
ZoneList<CharacterRange>* ranges = |
new (zone()) ZoneList<CharacterRange>(2, zone()); |
CharacterRange::AddClassEscape(c, ranges, zone()); |
- RegExpCharacterClass* cc = |
- new (zone()) RegExpCharacterClass(ranges, false); |
- builder->AddCharacterClass(cc); |
+ RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
+ builder->AddAtom(atom); |
break; |
} |
case '1': |
@@ -362,7 +353,7 @@ |
// escaped, |
// no other identity escapes are allowed. If the 'u' flag is not |
// present, all identity escapes are allowed. |
- if (!unicode()) { |
+ if (!unicode_) { |
builder->AddCharacter(first_digit); |
Advance(2); |
} else { |
@@ -423,7 +414,7 @@ |
uc32 value; |
if (ParseHexEscape(2, &value)) { |
builder->AddCharacter(value); |
- } else if (!unicode()) { |
+ } else if (!unicode_) { |
builder->AddCharacter('x'); |
} else { |
// If the 'u' flag is present, invalid escapes are not treated as |
@@ -437,7 +428,7 @@ |
uc32 value; |
if (ParseUnicodeEscape(&value)) { |
builder->AddUnicodeCharacter(value); |
- } else if (!unicode()) { |
+ } else if (!unicode_) { |
builder->AddCharacter('u'); |
} else { |
// If the 'u' flag is present, invalid escapes are not treated as |
@@ -453,7 +444,7 @@ |
// other identity escapes are allowed. If the 'u' flag is not |
// present, |
// all identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacter(current())) { |
+ if (!unicode_ || IsSyntaxCharacter(current())) { |
builder->AddCharacter(current()); |
Advance(); |
} else { |
@@ -754,7 +745,7 @@ |
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
// allowed). In the latter case, the number of hex digits between { } is |
// arbitrary. \ and u have already been read. |
- if (current() == '{' && unicode()) { |
+ if (current() == '{' && unicode_) { |
int start = position(); |
Advance(); |
if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
@@ -849,7 +840,7 @@ |
if (ParseHexEscape(2, &value)) { |
return value; |
} |
- if (!unicode()) { |
+ if (!unicode_) { |
// If \x is not followed by a two-digit hexadecimal, treat it |
// as an identity escape. |
return 'x'; |
@@ -865,7 +856,7 @@ |
if (ParseUnicodeEscape(&value)) { |
return value; |
} |
- if (!unicode()) { |
+ if (!unicode_) { |
return 'u'; |
} |
// If the 'u' flag is present, invalid escapes are not treated as |
@@ -878,7 +869,7 @@ |
// If the 'u' flag is present, only syntax characters can be escaped, no |
// other identity escapes are allowed. If the 'u' flag is not present, all |
// identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacter(result)) { |
+ if (!unicode_ || IsSyntaxCharacter(result)) { |
Advance(); |
return result; |
} |
@@ -908,29 +899,13 @@ |
case kEndMarker: |
return ReportError(CStrVector("\\ at end of pattern")); |
default: |
- first = ParseClassCharacterEscape(CHECK_FAILED); |
+ uc32 c = ParseClassCharacterEscape(CHECK_FAILED); |
+ return CharacterRange::Singleton(c); |
} |
} else { |
Advance(); |
- } |
- |
- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
- // Combine with possibly following trail surrogate. |
- int start = position(); |
- uc32 second = current(); |
- if (second == '\\') { |
- second = ParseClassCharacterEscape(CHECK_FAILED); |
- } else { |
- Advance(); |
- } |
- if (unibrow::Utf16::IsTrailSurrogate(second)) { |
- first = unibrow::Utf16::CombineSurrogatePair(first, second); |
- } else { |
- Reset(start); |
- } |
- } |
- |
- return CharacterRange::Singleton(first); |
+ return CharacterRange::Singleton(first); |
+ } |
} |
@@ -1010,10 +985,10 @@ |
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
- FlatStringReader* input, JSRegExp::Flags flags, |
- RegExpCompileData* result) { |
+ FlatStringReader* input, bool multiline, |
+ bool unicode, RegExpCompileData* result) { |
DCHECK(result != NULL); |
- RegExpParser parser(input, &result->error, flags, isolate, zone); |
+ RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); |
RegExpTree* tree = parser.ParsePattern(); |
if (parser.failed()) { |
DCHECK(tree == NULL); |
@@ -1036,12 +1011,10 @@ |
} |
-RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) |
+RegExpBuilder::RegExpBuilder(Zone* zone) |
: zone_(zone), |
pending_empty_(false), |
- flags_(flags), |
characters_(NULL), |
- pending_surrogate_(kNoPendingSurrogate), |
terms_(), |
alternatives_() |
#ifdef DEBUG |
@@ -1052,48 +1025,7 @@ |
} |
-void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { |
- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
- FlushPendingSurrogate(); |
- // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
- pending_surrogate_ = lead_surrogate; |
-} |
- |
- |
-void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
- DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
- if (pending_surrogate_ != kNoPendingSurrogate) { |
- uc16 lead_surrogate = pending_surrogate_; |
- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
- ZoneList<uc16> surrogate_pair(2, zone()); |
- surrogate_pair.Add(lead_surrogate, zone()); |
- surrogate_pair.Add(trail_surrogate, zone()); |
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
- pending_surrogate_ = kNoPendingSurrogate; |
- AddAtom(atom); |
- } else { |
- pending_surrogate_ = trail_surrogate; |
- FlushPendingSurrogate(); |
- } |
-} |
- |
- |
-void RegExpBuilder::FlushPendingSurrogate() { |
- if (pending_surrogate_ != kNoPendingSurrogate) { |
- // Use character class to desugar lone surrogate matching. |
- RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( |
- CharacterRange::List(zone(), |
- CharacterRange::Singleton(pending_surrogate_)), |
- false); |
- pending_surrogate_ = kNoPendingSurrogate; |
- DCHECK(unicode()); |
- AddCharacterClass(cc); |
- } |
-} |
- |
- |
void RegExpBuilder::FlushCharacters() { |
- FlushPendingSurrogate(); |
pending_empty_ = false; |
if (characters_ != NULL) { |
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
@@ -1121,7 +1053,6 @@ |
void RegExpBuilder::AddCharacter(uc16 c) { |
- FlushPendingSurrogate(); |
pending_empty_ = false; |
if (characters_ == NULL) { |
characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
@@ -1133,13 +1064,11 @@ |
void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
- DCHECK(unicode()); |
- AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
- AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
- } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
- AddLeadSurrogate(c); |
- } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
- AddTrailSurrogate(c); |
+ ZoneList<uc16> surrogate_pair(2, zone()); |
+ surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); |
+ surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); |
+ RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
+ AddAtom(atom); |
} else { |
AddCharacter(static_cast<uc16>(c)); |
} |
@@ -1147,17 +1076,6 @@ |
void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
- |
- |
-void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
- if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { |
- // In unicode mode, character class needs to be desugared, so it |
- // must be a standalone term instead of being part of a RegExpText. |
- AddTerm(cc); |
- } else { |
- AddAtom(cc); |
- } |
-} |
void RegExpBuilder::AddAtom(RegExpTree* term) { |
@@ -1172,13 +1090,6 @@ |
FlushText(); |
terms_.Add(term, zone()); |
} |
- LAST(ADD_ATOM); |
-} |
- |
- |
-void RegExpBuilder::AddTerm(RegExpTree* term) { |
- FlushText(); |
- terms_.Add(term, zone()); |
LAST(ADD_ATOM); |
} |
@@ -1221,7 +1132,6 @@ |
void RegExpBuilder::AddQuantifierToAtom( |
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
- FlushPendingSurrogate(); |
if (pending_empty_) { |
pending_empty_ = false; |
return; |