src/regexp/regexp-parser.cc - Issue 1578253005: [regexp] implement character classes for unicode regexps.

Unified Diff: src/regexp/regexp-parser.cc

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) | Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: more tests | Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are viewable only by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index fa8900342cfc4878411a1c06d753254024f138fe..07d5779675786b0dfbec11fb7a8cf8fa19f3aecb 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -15,20 +15,18 @@ namespace v8 {

namespace internal {

RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

- bool multiline, bool unicode, Isolate* isolate,

- Zone* zone)

+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone)

: isolate_(isolate),

zone_(zone),

error_(error),

captures_(NULL),

in_(in),

current_(kEndMarker),

+ flags_(flags),

next_pos_(0),

captures_started_(0),

capture_count_(0),

has_more_(true),

- multiline_(multiline),

- unicode_(unicode),

simple_(false),

contains_anchor_(false),

is_scanned_for_captures_(false),

@@ -37,9 +35,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

}

+template <bool update_position>

+uc32 RegExpParser::ReadNext() {

+ int position = next_pos_;

+ uc32 c0 = in()->Get(position);

+ position++;

+ // Read the whole surrogate pair in case of unicode flag, if possible.

+ if (unicode() && position < in()->length() &&

+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

+ uc16 c1 = in()->Get(position);

+ if (unibrow::Utf16::IsTrailSurrogate(c1)) {

+ c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

+ position++;

+ }

+ if (update_position) next_pos_ = position;

+ return c0;

uc32 RegExpParser::Next() {

if (has_next()) {

- return in()->Get(next_pos_);

+ return ReadNext<false>();

} else {

return kEndMarker;

}

@@ -47,25 +64,14 @@ uc32 RegExpParser::Next() {

void RegExpParser::Advance() {

- if (next_pos_ < in()->length()) {

+ if (has_next()) {

StackLimitCheck check(isolate());

if (check.HasOverflowed()) {

ReportError(CStrVector(Isolate::kStackOverflowMessage));

} else if (zone()->excess_allocation()) {

ReportError(CStrVector("Regular expression too large"));

} else {

- current_ = in()->Get(next_pos_);

- next_pos_++;

- // Read the whole surrogate pair in case of unicode flag, if possible.

- if (unicode_ && next_pos_ < in()->length() &&

- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {

- uc16 trail = in()->Get(next_pos_);

- if (unibrow::Utf16::IsTrailSurrogate(trail)) {

- current_ = unibrow::Utf16::CombineSurrogatePair(

- static_cast<uc16>(current_), trail);

- next_pos_++;

- }

+ current_ = ReadNext<true>();

}

} else {

current_ = kEndMarker;

@@ -142,7 +148,7 @@ RegExpTree* RegExpParser::ParsePattern() {

RegExpTree* RegExpParser::ParseDisjunction() {

// Used to store current state while parsing subexpressions.

RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

- zone());

+ flags_, zone());

RegExpParserState* state = &initial_state;

// Cache the builder in a local variable for quick access.

RegExpBuilder* builder = initial_state.builder();

@@ -206,7 +212,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

return ReportError(CStrVector("Nothing to repeat"));

case '^': {

Advance();

- if (multiline_) {

+ if (multiline()) {

builder->AddAssertion(

new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

} else {

@@ -219,8 +225,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {

case '$': {

Advance();

RegExpAssertion::AssertionType assertion_type =

- multiline_ ? RegExpAssertion::END_OF_LINE

- : RegExpAssertion::END_OF_INPUT;

+ multiline() ? RegExpAssertion::END_OF_LINE

+ : RegExpAssertion::END_OF_INPUT;

builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));

continue;

}

@@ -230,8 +236,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {

ZoneList<CharacterRange>* ranges =

new (zone()) ZoneList<CharacterRange>(2, zone());

CharacterRange::AddClassEscape('.', ranges, zone());

- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

- builder->AddAtom(atom);

+ RegExpCharacterClass* cc =

+ new (zone()) RegExpCharacterClass(ranges, false);

+ builder->AddCharacterClass(cc);

break;

}

case '(': {

@@ -276,14 +283,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {

captures_started_++;

}

// Store current state and begin new disjunction parsing.

- state = new (zone()) RegExpParserState(

- state, subexpr_type, lookaround_type, captures_started_, zone());

+ state =

+ new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,

+ captures_started_, flags_, zone());

builder = state->builder();

continue;

}

case '[': {

- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

- builder->AddAtom(atom);

+ RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);

+ builder->AddCharacterClass(cc->AsCharacterClass());

break;

}

// Atom ::

@@ -318,8 +326,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {

ZoneList<CharacterRange>* ranges =

new (zone()) ZoneList<CharacterRange>(2, zone());

CharacterRange::AddClassEscape(c, ranges, zone());

- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

- builder->AddAtom(atom);

+ RegExpCharacterClass* cc =

+ new (zone()) RegExpCharacterClass(ranges, false);

+ builder->AddCharacterClass(cc);

break;

}

case '1':

@@ -353,7 +362,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

// escaped,

// no other identity escapes are allowed. If the 'u' flag is not

// present, all identity escapes are allowed.

- if (!unicode_) {

+ if (!unicode()) {

builder->AddCharacter(first_digit);

Advance(2);

} else {

@@ -414,7 +423,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

uc32 value;

if (ParseHexEscape(2, &value)) {

builder->AddCharacter(value);

- } else if (!unicode_) {

+ } else if (!unicode()) {

builder->AddCharacter('x');

} else {

// If the 'u' flag is present, invalid escapes are not treated as

@@ -428,7 +437,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

uc32 value;

if (ParseUnicodeEscape(&value)) {

builder->AddUnicodeCharacter(value);

- } else if (!unicode_) {

+ } else if (!unicode()) {

builder->AddCharacter('u');

} else {

// If the 'u' flag is present, invalid escapes are not treated as

@@ -444,7 +453,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

// other identity escapes are allowed. If the 'u' flag is not

// present,

// all identity escapes are allowed.

- if (!unicode_ || IsSyntaxCharacter(current())) {

+ if (!unicode() || IsSyntaxCharacter(current())) {

builder->AddCharacter(current());

Advance();

} else {

@@ -745,7 +754,7 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {

// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

// allowed). In the latter case, the number of hex digits between { } is

// arbitrary. \ and u have already been read.

- if (current() == '{' && unicode_) {

+ if (current() == '{' && unicode()) {

int start = position();

Advance();

if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

@@ -840,7 +849,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

if (ParseHexEscape(2, &value)) {

return value;

}

- if (!unicode_) {

+ if (!unicode()) {

// If \x is not followed by a two-digit hexadecimal, treat it

// as an identity escape.

return 'x';

@@ -856,7 +865,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

if (ParseUnicodeEscape(&value)) {

return value;

}

- if (!unicode_) {

+ if (!unicode()) {

return 'u';

}

// If the 'u' flag is present, invalid escapes are not treated as

@@ -869,7 +878,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

// If the 'u' flag is present, only syntax characters can be escaped, no

// other identity escapes are allowed. If the 'u' flag is not present, all

// identity escapes are allowed.

- if (!unicode_ || IsSyntaxCharacter(result)) {

+ if (!unicode() || IsSyntaxCharacter(result)) {

Advance();

return result;

}

@@ -899,13 +908,29 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

case kEndMarker:

return ReportError(CStrVector("\\ at end of pattern"));

default:

- uc32 c = ParseClassCharacterEscape(CHECK_FAILED);

- return CharacterRange::Singleton(c);

+ first = ParseClassCharacterEscape(CHECK_FAILED);

}

} else {

Advance();

- return CharacterRange::Singleton(first);

}

+ if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

+ // Combine with possibly following trail surrogate.

+ int start = position();

+ uc32 second = current();

+ if (second == '\\') {

+ second = ParseClassCharacterEscape(CHECK_FAILED);

+ } else {

+ Advance();

+ }

+ if (unibrow::Utf16::IsTrailSurrogate(second)) {

+ first = unibrow::Utf16::CombineSurrogatePair(first, second);

+ } else {

+ Reset(start);

+ }

+ return CharacterRange::Singleton(first);

}

@@ -985,10 +1010,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() {

bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

- FlatStringReader* input, bool multiline,

- bool unicode, RegExpCompileData* result) {

+ FlatStringReader* input, JSRegExp::Flags flags,

+ RegExpCompileData* result) {

DCHECK(result != NULL);

- RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);

+ RegExpParser parser(input, &result->error, flags, isolate, zone);

RegExpTree* tree = parser.ParsePattern();

if (parser.failed()) {

DCHECK(tree == NULL);

@@ -1011,10 +1036,12 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

}

-RegExpBuilder::RegExpBuilder(Zone* zone)

+RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)

: zone_(zone),

pending_empty_(false),

+ flags_(flags),

characters_(NULL),

+ pending_surrogate_(kNoPendingSurrogate),

terms_(),

alternatives_()

#ifdef DEBUG

@@ -1025,7 +1052,48 @@ RegExpBuilder::RegExpBuilder(Zone* zone)

}

+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {

+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

+ FlushPendingSurrogate();

+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow.

+ pending_surrogate_ = lead_surrogate;

+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

+ DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

+ if (pending_surrogate_ != kNoPendingSurrogate) {

+ uc16 lead_surrogate = pending_surrogate_;

+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

+ ZoneList<uc16> surrogate_pair(2, zone());

+ surrogate_pair.Add(lead_surrogate, zone());

+ surrogate_pair.Add(trail_surrogate, zone());

+ RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

+ pending_surrogate_ = kNoPendingSurrogate;

+ AddAtom(atom);

+ } else {

+ pending_surrogate_ = trail_surrogate;

+ FlushPendingSurrogate();

+ }

+void RegExpBuilder::FlushPendingSurrogate() {

+ if (pending_surrogate_ != kNoPendingSurrogate) {

+ // Use character class to desugar lone surrogate matching.

+ RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(

+ CharacterRange::List(zone(),

+ CharacterRange::Singleton(pending_surrogate_)),

+ false);

+ pending_surrogate_ = kNoPendingSurrogate;

+ DCHECK(unicode());

+ AddCharacterClass(cc);

+ }

void RegExpBuilder::FlushCharacters() {

+ FlushPendingSurrogate();

pending_empty_ = false;

if (characters_ != NULL) {

RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());

@@ -1053,6 +1121,7 @@ void RegExpBuilder::FlushText() {

void RegExpBuilder::AddCharacter(uc16 c) {

+ FlushPendingSurrogate();

pending_empty_ = false;

if (characters_ == NULL) {

characters_ = new (zone()) ZoneList<uc16>(4, zone());

@@ -1064,11 +1133,13 @@ void RegExpBuilder::AddCharacter(uc16 c) {

void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {

- ZoneList<uc16> surrogate_pair(2, zone());

- surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());

- surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());

- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

- AddAtom(atom);

+ DCHECK(unicode());

+ AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));

+ AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

+ } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

+ AddLeadSurrogate(c);

+ } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

+ AddTrailSurrogate(c);

} else {

AddCharacter(static_cast<uc16>(c));

}

@@ -1078,6 +1149,17 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

+ if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {

+ // In unicode mode, character class needs to be desugared, so it

+ // must be a standalone term instead of being part of a RegExpText.

+ AddTerm(cc);

+ } else {

+ AddAtom(cc);

+ }

void RegExpBuilder::AddAtom(RegExpTree* term) {

if (term->IsEmpty()) {

AddEmpty();

@@ -1094,6 +1176,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) {

}

+void RegExpBuilder::AddTerm(RegExpTree* term) {

+ FlushText();

+ terms_.Add(term, zone());

+ LAST(ADD_ATOM);

void RegExpBuilder::AddAssertion(RegExpTree* assert) {

FlushText();

terms_.Add(assert, zone());

@@ -1132,6 +1221,7 @@ RegExpTree* RegExpBuilder::ToRegExp() {

void RegExpBuilder::AddQuantifierToAtom(

int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

+ FlushPendingSurrogate();

if (pending_empty_) {

pending_empty_ = false;

return;

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »