| Index: src/regexp/regexp-parser.cc | 
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc | 
| index fa8900342cfc4878411a1c06d753254024f138fe..07d5779675786b0dfbec11fb7a8cf8fa19f3aecb 100644 | 
| --- a/src/regexp/regexp-parser.cc | 
| +++ b/src/regexp/regexp-parser.cc | 
| @@ -15,20 +15,18 @@ namespace v8 { | 
| namespace internal { | 
|  | 
| RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 
| -                           bool multiline, bool unicode, Isolate* isolate, | 
| -                           Zone* zone) | 
| +                           JSRegExp::Flags flags, Isolate* isolate, Zone* zone) | 
| : isolate_(isolate), | 
| zone_(zone), | 
| error_(error), | 
| captures_(NULL), | 
| in_(in), | 
| current_(kEndMarker), | 
| +      flags_(flags), | 
| next_pos_(0), | 
| captures_started_(0), | 
| capture_count_(0), | 
| has_more_(true), | 
| -      multiline_(multiline), | 
| -      unicode_(unicode), | 
| simple_(false), | 
| contains_anchor_(false), | 
| is_scanned_for_captures_(false), | 
| @@ -37,9 +35,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 
| } | 
|  | 
|  | 
| +template <bool update_position> | 
| +uc32 RegExpParser::ReadNext() { | 
| +  int position = next_pos_; | 
| +  uc32 c0 = in()->Get(position); | 
| +  position++; | 
| +  // Read the whole surrogate pair in case of unicode flag, if possible. | 
| +  if (unicode() && position < in()->length() && | 
| +      unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { | 
| +    uc16 c1 = in()->Get(position); | 
| +    if (unibrow::Utf16::IsTrailSurrogate(c1)) { | 
| +      c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); | 
| +      position++; | 
| +    } | 
| +  } | 
| +  if (update_position) next_pos_ = position; | 
| +  return c0; | 
| +} | 
| + | 
| + | 
| uc32 RegExpParser::Next() { | 
| if (has_next()) { | 
| -    return in()->Get(next_pos_); | 
| +    return ReadNext<false>(); | 
| } else { | 
| return kEndMarker; | 
| } | 
| @@ -47,25 +64,14 @@ uc32 RegExpParser::Next() { | 
|  | 
|  | 
| void RegExpParser::Advance() { | 
| -  if (next_pos_ < in()->length()) { | 
| +  if (has_next()) { | 
| StackLimitCheck check(isolate()); | 
| if (check.HasOverflowed()) { | 
| ReportError(CStrVector(Isolate::kStackOverflowMessage)); | 
| } else if (zone()->excess_allocation()) { | 
| ReportError(CStrVector("Regular expression too large")); | 
| } else { | 
| -      current_ = in()->Get(next_pos_); | 
| -      next_pos_++; | 
| -      // Read the whole surrogate pair in case of unicode flag, if possible. | 
| -      if (unicode_ && next_pos_ < in()->length() && | 
| -          unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { | 
| -        uc16 trail = in()->Get(next_pos_); | 
| -        if (unibrow::Utf16::IsTrailSurrogate(trail)) { | 
| -          current_ = unibrow::Utf16::CombineSurrogatePair( | 
| -              static_cast<uc16>(current_), trail); | 
| -          next_pos_++; | 
| -        } | 
| -      } | 
| +      current_ = ReadNext<true>(); | 
| } | 
| } else { | 
| current_ = kEndMarker; | 
| @@ -142,7 +148,7 @@ RegExpTree* RegExpParser::ParsePattern() { | 
| RegExpTree* RegExpParser::ParseDisjunction() { | 
| // Used to store current state while parsing subexpressions. | 
| RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 
| -                                  zone()); | 
| +                                  flags_, zone()); | 
| RegExpParserState* state = &initial_state; | 
| // Cache the builder in a local variable for quick access. | 
| RegExpBuilder* builder = initial_state.builder(); | 
| @@ -206,7 +212,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| return ReportError(CStrVector("Nothing to repeat")); | 
| case '^': { | 
| Advance(); | 
| -        if (multiline_) { | 
| +        if (multiline()) { | 
| builder->AddAssertion( | 
| new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); | 
| } else { | 
| @@ -219,8 +225,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| case '$': { | 
| Advance(); | 
| RegExpAssertion::AssertionType assertion_type = | 
| -            multiline_ ? RegExpAssertion::END_OF_LINE | 
| -                       : RegExpAssertion::END_OF_INPUT; | 
| +            multiline() ? RegExpAssertion::END_OF_LINE | 
| +                        : RegExpAssertion::END_OF_INPUT; | 
| builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); | 
| continue; | 
| } | 
| @@ -230,8 +236,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| ZoneList<CharacterRange>* ranges = | 
| new (zone()) ZoneList<CharacterRange>(2, zone()); | 
| CharacterRange::AddClassEscape('.', ranges, zone()); | 
| -        RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); | 
| -        builder->AddAtom(atom); | 
| +        RegExpCharacterClass* cc = | 
| +            new (zone()) RegExpCharacterClass(ranges, false); | 
| +        builder->AddCharacterClass(cc); | 
| break; | 
| } | 
| case '(': { | 
| @@ -276,14 +283,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| captures_started_++; | 
| } | 
| // Store current state and begin new disjunction parsing. | 
| -        state = new (zone()) RegExpParserState( | 
| -            state, subexpr_type, lookaround_type, captures_started_, zone()); | 
| +        state = | 
| +            new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, | 
| +                                           captures_started_, flags_, zone()); | 
| builder = state->builder(); | 
| continue; | 
| } | 
| case '[': { | 
| -        RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); | 
| -        builder->AddAtom(atom); | 
| +        RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); | 
| +        builder->AddCharacterClass(cc->AsCharacterClass()); | 
| break; | 
| } | 
| // Atom :: | 
| @@ -318,8 +326,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| ZoneList<CharacterRange>* ranges = | 
| new (zone()) ZoneList<CharacterRange>(2, zone()); | 
| CharacterRange::AddClassEscape(c, ranges, zone()); | 
| -            RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); | 
| -            builder->AddAtom(atom); | 
| +            RegExpCharacterClass* cc = | 
| +                new (zone()) RegExpCharacterClass(ranges, false); | 
| +            builder->AddCharacterClass(cc); | 
| break; | 
| } | 
| case '1': | 
| @@ -353,7 +362,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| // escaped, | 
| // no other identity escapes are allowed. If the 'u' flag is not | 
| // present, all identity escapes are allowed. | 
| -              if (!unicode_) { | 
| +              if (!unicode()) { | 
| builder->AddCharacter(first_digit); | 
| Advance(2); | 
| } else { | 
| @@ -414,7 +423,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| uc32 value; | 
| if (ParseHexEscape(2, &value)) { | 
| builder->AddCharacter(value); | 
| -            } else if (!unicode_) { | 
| +            } else if (!unicode()) { | 
| builder->AddCharacter('x'); | 
| } else { | 
| // If the 'u' flag is present, invalid escapes are not treated as | 
| @@ -428,7 +437,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| uc32 value; | 
| if (ParseUnicodeEscape(&value)) { | 
| builder->AddUnicodeCharacter(value); | 
| -            } else if (!unicode_) { | 
| +            } else if (!unicode()) { | 
| builder->AddCharacter('u'); | 
| } else { | 
| // If the 'u' flag is present, invalid escapes are not treated as | 
| @@ -444,7 +453,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { | 
| // other identity escapes are allowed. If the 'u' flag is not | 
| // present, | 
| // all identity escapes are allowed. | 
| -            if (!unicode_ || IsSyntaxCharacter(current())) { | 
| +            if (!unicode() || IsSyntaxCharacter(current())) { | 
| builder->AddCharacter(current()); | 
| Advance(); | 
| } else { | 
| @@ -745,7 +754,7 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 
| // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 
| // allowed). In the latter case, the number of hex digits between { } is | 
| // arbitrary. \ and u have already been read. | 
| -  if (current() == '{' && unicode_) { | 
| +  if (current() == '{' && unicode()) { | 
| int start = position(); | 
| Advance(); | 
| if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 
| @@ -840,7 +849,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { | 
| if (ParseHexEscape(2, &value)) { | 
| return value; | 
| } | 
| -      if (!unicode_) { | 
| +      if (!unicode()) { | 
| // If \x is not followed by a two-digit hexadecimal, treat it | 
| // as an identity escape. | 
| return 'x'; | 
| @@ -856,7 +865,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { | 
| if (ParseUnicodeEscape(&value)) { | 
| return value; | 
| } | 
| -      if (!unicode_) { | 
| +      if (!unicode()) { | 
| return 'u'; | 
| } | 
| // If the 'u' flag is present, invalid escapes are not treated as | 
| @@ -869,7 +878,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { | 
| // If the 'u' flag is present, only syntax characters can be escaped, no | 
| // other identity escapes are allowed. If the 'u' flag is not present, all | 
| // identity escapes are allowed. | 
| -      if (!unicode_ || IsSyntaxCharacter(result)) { | 
| +      if (!unicode() || IsSyntaxCharacter(result)) { | 
| Advance(); | 
| return result; | 
| } | 
| @@ -899,13 +908,29 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { | 
| case kEndMarker: | 
| return ReportError(CStrVector("\\ at end of pattern")); | 
| default: | 
| -        uc32 c = ParseClassCharacterEscape(CHECK_FAILED); | 
| -        return CharacterRange::Singleton(c); | 
| +        first = ParseClassCharacterEscape(CHECK_FAILED); | 
| } | 
| } else { | 
| Advance(); | 
| -    return CharacterRange::Singleton(first); | 
| } | 
| + | 
| +  if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | 
| +    // Combine with possibly following trail surrogate. | 
| +    int start = position(); | 
| +    uc32 second = current(); | 
| +    if (second == '\\') { | 
| +      second = ParseClassCharacterEscape(CHECK_FAILED); | 
| +    } else { | 
| +      Advance(); | 
| +    } | 
| +    if (unibrow::Utf16::IsTrailSurrogate(second)) { | 
| +      first = unibrow::Utf16::CombineSurrogatePair(first, second); | 
| +    } else { | 
| +      Reset(start); | 
| +    } | 
| +  } | 
| + | 
| +  return CharacterRange::Singleton(first); | 
| } | 
|  | 
|  | 
| @@ -985,10 +1010,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() { | 
|  | 
|  | 
| bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | 
| -                               FlatStringReader* input, bool multiline, | 
| -                               bool unicode, RegExpCompileData* result) { | 
| +                               FlatStringReader* input, JSRegExp::Flags flags, | 
| +                               RegExpCompileData* result) { | 
| DCHECK(result != NULL); | 
| -  RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); | 
| +  RegExpParser parser(input, &result->error, flags, isolate, zone); | 
| RegExpTree* tree = parser.ParsePattern(); | 
| if (parser.failed()) { | 
| DCHECK(tree == NULL); | 
| @@ -1011,10 +1036,12 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | 
| } | 
|  | 
|  | 
| -RegExpBuilder::RegExpBuilder(Zone* zone) | 
| +RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) | 
| : zone_(zone), | 
| pending_empty_(false), | 
| +      flags_(flags), | 
| characters_(NULL), | 
| +      pending_surrogate_(kNoPendingSurrogate), | 
| terms_(), | 
| alternatives_() | 
| #ifdef DEBUG | 
| @@ -1025,7 +1052,48 @@ RegExpBuilder::RegExpBuilder(Zone* zone) | 
| } | 
|  | 
|  | 
| +void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { | 
| +  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | 
| +  FlushPendingSurrogate(); | 
| +  // Hold onto the lead surrogate, waiting for a trail surrogate to follow. | 
| +  pending_surrogate_ = lead_surrogate; | 
| +} | 
| + | 
| + | 
| +void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { | 
| +  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); | 
| +  if (pending_surrogate_ != kNoPendingSurrogate) { | 
| +    uc16 lead_surrogate = pending_surrogate_; | 
| +    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | 
| +    ZoneList<uc16> surrogate_pair(2, zone()); | 
| +    surrogate_pair.Add(lead_surrogate, zone()); | 
| +    surrogate_pair.Add(trail_surrogate, zone()); | 
| +    RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | 
| +    pending_surrogate_ = kNoPendingSurrogate; | 
| +    AddAtom(atom); | 
| +  } else { | 
| +    pending_surrogate_ = trail_surrogate; | 
| +    FlushPendingSurrogate(); | 
| +  } | 
| +} | 
| + | 
| + | 
| +void RegExpBuilder::FlushPendingSurrogate() { | 
| +  if (pending_surrogate_ != kNoPendingSurrogate) { | 
| +    // Use character class to desugar lone surrogate matching. | 
| +    RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( | 
| +        CharacterRange::List(zone(), | 
| +                             CharacterRange::Singleton(pending_surrogate_)), | 
| +        false); | 
| +    pending_surrogate_ = kNoPendingSurrogate; | 
| +    DCHECK(unicode()); | 
| +    AddCharacterClass(cc); | 
| +  } | 
| +} | 
| + | 
| + | 
| void RegExpBuilder::FlushCharacters() { | 
| +  FlushPendingSurrogate(); | 
| pending_empty_ = false; | 
| if (characters_ != NULL) { | 
| RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 
| @@ -1053,6 +1121,7 @@ void RegExpBuilder::FlushText() { | 
|  | 
|  | 
| void RegExpBuilder::AddCharacter(uc16 c) { | 
| +  FlushPendingSurrogate(); | 
| pending_empty_ = false; | 
| if (characters_ == NULL) { | 
| characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 
| @@ -1064,11 +1133,13 @@ void RegExpBuilder::AddCharacter(uc16 c) { | 
|  | 
| void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 
| if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 
| -    ZoneList<uc16> surrogate_pair(2, zone()); | 
| -    surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); | 
| -    surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); | 
| -    RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | 
| -    AddAtom(atom); | 
| +    DCHECK(unicode()); | 
| +    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); | 
| +    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 
| +  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 
| +    AddLeadSurrogate(c); | 
| +  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 
| +    AddTrailSurrogate(c); | 
| } else { | 
| AddCharacter(static_cast<uc16>(c)); | 
| } | 
| @@ -1078,6 +1149,17 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 
| void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 
|  | 
|  | 
| +void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 
| +  if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { | 
| +    // In unicode mode, character class needs to be desugared, so it | 
| +    // must be a standalone term instead of being part of a RegExpText. | 
| +    AddTerm(cc); | 
| +  } else { | 
| +    AddAtom(cc); | 
| +  } | 
| +} | 
| + | 
| + | 
| void RegExpBuilder::AddAtom(RegExpTree* term) { | 
| if (term->IsEmpty()) { | 
| AddEmpty(); | 
| @@ -1094,6 +1176,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) { | 
| } | 
|  | 
|  | 
| +void RegExpBuilder::AddTerm(RegExpTree* term) { | 
| +  FlushText(); | 
| +  terms_.Add(term, zone()); | 
| +  LAST(ADD_ATOM); | 
| +} | 
| + | 
| + | 
| void RegExpBuilder::AddAssertion(RegExpTree* assert) { | 
| FlushText(); | 
| terms_.Add(assert, zone()); | 
| @@ -1132,6 +1221,7 @@ RegExpTree* RegExpBuilder::ToRegExp() { | 
|  | 
| void RegExpBuilder::AddQuantifierToAtom( | 
| int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 
| +  FlushPendingSurrogate(); | 
| if (pending_empty_) { | 
| pending_empty_ = false; | 
| return; | 
|  |