src/regexp/regexp-parser.cc - Issue 1618753002: Revert of [regexp] implement character classes for unicode regexps.

Unified Diff: src/regexp/regexp-parser.cc

Issue 1618753002: Revert of [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..fa8900342cfc4878411a1c06d753254024f138fe 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -15,18 +15,20 @@

namespace internal {

RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

- JSRegExp::Flags flags, Isolate* isolate, Zone* zone)

+ bool multiline, bool unicode, Isolate* isolate,

+ Zone* zone)

: isolate_(isolate),

zone_(zone),

error_(error),

captures_(NULL),

in_(in),

current_(kEndMarker),

- flags_(flags),

next_pos_(0),

captures_started_(0),

capture_count_(0),

has_more_(true),

+ multiline_(multiline),

+ unicode_(unicode),

simple_(false),

contains_anchor_(false),

is_scanned_for_captures_(false),

@@ -35,28 +37,9 @@

}

-template <bool update_position>

-uc32 RegExpParser::ReadNext() {

- int position = next_pos_;

- uc32 c0 = in()->Get(position);

- position++;

- // Read the whole surrogate pair in case of unicode flag, if possible.

- if (unicode() && position < in()->length() &&

- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

- uc16 c1 = in()->Get(position);

- if (unibrow::Utf16::IsTrailSurrogate(c1)) {

- c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

- position++;

- }

- if (update_position) next_pos_ = position;

- return c0;

uc32 RegExpParser::Next() {

if (has_next()) {

- return ReadNext<false>();

+ return in()->Get(next_pos_);

} else {

return kEndMarker;

}

@@ -64,14 +47,25 @@

void RegExpParser::Advance() {

- if (has_next()) {

+ if (next_pos_ < in()->length()) {

StackLimitCheck check(isolate());

if (check.HasOverflowed()) {

ReportError(CStrVector(Isolate::kStackOverflowMessage));

} else if (zone()->excess_allocation()) {

ReportError(CStrVector("Regular expression too large"));

} else {

- current_ = ReadNext<true>();

+ current_ = in()->Get(next_pos_);

+ next_pos_++;

+ // Read the whole surrogate pair in case of unicode flag, if possible.

+ if (unicode_ && next_pos_ < in()->length() &&

+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {

+ uc16 trail = in()->Get(next_pos_);

+ if (unibrow::Utf16::IsTrailSurrogate(trail)) {

+ current_ = unibrow::Utf16::CombineSurrogatePair(

+ static_cast<uc16>(current_), trail);

+ next_pos_++;

+ }

}

} else {

current_ = kEndMarker;

@@ -148,7 +142,7 @@

RegExpTree* RegExpParser::ParseDisjunction() {

// Used to store current state while parsing subexpressions.

RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

- flags_, zone());

+ zone());

RegExpParserState* state = &initial_state;

// Cache the builder in a local variable for quick access.

RegExpBuilder* builder = initial_state.builder();

@@ -212,7 +206,7 @@

return ReportError(CStrVector("Nothing to repeat"));

case '^': {

Advance();

- if (multiline()) {

+ if (multiline_) {

builder->AddAssertion(

new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

} else {

@@ -225,8 +219,8 @@

case '$': {

Advance();

RegExpAssertion::AssertionType assertion_type =

- multiline() ? RegExpAssertion::END_OF_LINE

- : RegExpAssertion::END_OF_INPUT;

+ multiline_ ? RegExpAssertion::END_OF_LINE

+ : RegExpAssertion::END_OF_INPUT;

builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));

continue;

}

@@ -236,9 +230,8 @@

ZoneList<CharacterRange>* ranges =

new (zone()) ZoneList<CharacterRange>(2, zone());

CharacterRange::AddClassEscape('.', ranges, zone());

- RegExpCharacterClass* cc =

- new (zone()) RegExpCharacterClass(ranges, false);

- builder->AddCharacterClass(cc);

+ RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

+ builder->AddAtom(atom);

break;

}

case '(': {

@@ -283,15 +276,14 @@

captures_started_++;

}

// Store current state and begin new disjunction parsing.

- state =

- new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,

- captures_started_, flags_, zone());

+ state = new (zone()) RegExpParserState(

+ state, subexpr_type, lookaround_type, captures_started_, zone());

builder = state->builder();

continue;

}

case '[': {

- RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);

- builder->AddCharacterClass(cc->AsCharacterClass());

+ RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

+ builder->AddAtom(atom);

break;

}

// Atom ::

@@ -326,9 +318,8 @@

ZoneList<CharacterRange>* ranges =

new (zone()) ZoneList<CharacterRange>(2, zone());

CharacterRange::AddClassEscape(c, ranges, zone());

- RegExpCharacterClass* cc =

- new (zone()) RegExpCharacterClass(ranges, false);

- builder->AddCharacterClass(cc);

+ RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

+ builder->AddAtom(atom);

break;

}

case '1':

@@ -362,7 +353,7 @@

// escaped,

// no other identity escapes are allowed. If the 'u' flag is not

// present, all identity escapes are allowed.

- if (!unicode()) {

+ if (!unicode_) {

builder->AddCharacter(first_digit);

Advance(2);

} else {

@@ -423,7 +414,7 @@

uc32 value;

if (ParseHexEscape(2, &value)) {

builder->AddCharacter(value);

- } else if (!unicode()) {

+ } else if (!unicode_) {

builder->AddCharacter('x');

} else {

// If the 'u' flag is present, invalid escapes are not treated as

@@ -437,7 +428,7 @@

uc32 value;

if (ParseUnicodeEscape(&value)) {

builder->AddUnicodeCharacter(value);

- } else if (!unicode()) {

+ } else if (!unicode_) {

builder->AddCharacter('u');

} else {

// If the 'u' flag is present, invalid escapes are not treated as

@@ -453,7 +444,7 @@

// other identity escapes are allowed. If the 'u' flag is not

// present,

// all identity escapes are allowed.

- if (!unicode() || IsSyntaxCharacter(current())) {

+ if (!unicode_ || IsSyntaxCharacter(current())) {

builder->AddCharacter(current());

Advance();

} else {

@@ -754,7 +745,7 @@

// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

// allowed). In the latter case, the number of hex digits between { } is

// arbitrary. \ and u have already been read.

- if (current() == '{' && unicode()) {

+ if (current() == '{' && unicode_) {

int start = position();

Advance();

if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

@@ -849,7 +840,7 @@

if (ParseHexEscape(2, &value)) {

return value;

}

- if (!unicode()) {

+ if (!unicode_) {

// If \x is not followed by a two-digit hexadecimal, treat it

// as an identity escape.

return 'x';

@@ -865,7 +856,7 @@

if (ParseUnicodeEscape(&value)) {

return value;

}

- if (!unicode()) {

+ if (!unicode_) {

return 'u';

}

// If the 'u' flag is present, invalid escapes are not treated as

@@ -878,7 +869,7 @@

// If the 'u' flag is present, only syntax characters can be escaped, no

// other identity escapes are allowed. If the 'u' flag is not present, all

// identity escapes are allowed.

- if (!unicode() || IsSyntaxCharacter(result)) {

+ if (!unicode_ || IsSyntaxCharacter(result)) {

Advance();

return result;

}

@@ -908,29 +899,13 @@

case kEndMarker:

return ReportError(CStrVector("\\ at end of pattern"));

default:

- first = ParseClassCharacterEscape(CHECK_FAILED);

+ uc32 c = ParseClassCharacterEscape(CHECK_FAILED);

+ return CharacterRange::Singleton(c);

}

} else {

Advance();

- }

- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

- // Combine with possibly following trail surrogate.

- int start = position();

- uc32 second = current();

- if (second == '\\') {

- second = ParseClassCharacterEscape(CHECK_FAILED);

- } else {

- Advance();

- }

- if (unibrow::Utf16::IsTrailSurrogate(second)) {

- first = unibrow::Utf16::CombineSurrogatePair(first, second);

- } else {

- Reset(start);

- }

- return CharacterRange::Singleton(first);

+ return CharacterRange::Singleton(first);

+ }

}

@@ -1010,10 +985,10 @@

bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

- FlatStringReader* input, JSRegExp::Flags flags,

- RegExpCompileData* result) {

+ FlatStringReader* input, bool multiline,

+ bool unicode, RegExpCompileData* result) {

DCHECK(result != NULL);

- RegExpParser parser(input, &result->error, flags, isolate, zone);

+ RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);

RegExpTree* tree = parser.ParsePattern();

if (parser.failed()) {

DCHECK(tree == NULL);

@@ -1036,12 +1011,10 @@

}

-RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)

+RegExpBuilder::RegExpBuilder(Zone* zone)

: zone_(zone),

pending_empty_(false),

- flags_(flags),

characters_(NULL),

- pending_surrogate_(kNoPendingSurrogate),

terms_(),

alternatives_()

#ifdef DEBUG

@@ -1052,48 +1025,7 @@

}

-void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {

- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

- FlushPendingSurrogate();

- // Hold onto the lead surrogate, waiting for a trail surrogate to follow.

- pending_surrogate_ = lead_surrogate;

-void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

- DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

- if (pending_surrogate_ != kNoPendingSurrogate) {

- uc16 lead_surrogate = pending_surrogate_;

- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

- ZoneList<uc16> surrogate_pair(2, zone());

- surrogate_pair.Add(lead_surrogate, zone());

- surrogate_pair.Add(trail_surrogate, zone());

- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

- pending_surrogate_ = kNoPendingSurrogate;

- AddAtom(atom);

- } else {

- pending_surrogate_ = trail_surrogate;

- FlushPendingSurrogate();

- }

-void RegExpBuilder::FlushPendingSurrogate() {

- if (pending_surrogate_ != kNoPendingSurrogate) {

- // Use character class to desugar lone surrogate matching.

- RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(

- CharacterRange::List(zone(),

- CharacterRange::Singleton(pending_surrogate_)),

- false);

- pending_surrogate_ = kNoPendingSurrogate;

- DCHECK(unicode());

- AddCharacterClass(cc);

- }

void RegExpBuilder::FlushCharacters() {

- FlushPendingSurrogate();

pending_empty_ = false;

if (characters_ != NULL) {

RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());

@@ -1121,7 +1053,6 @@

void RegExpBuilder::AddCharacter(uc16 c) {

- FlushPendingSurrogate();

pending_empty_ = false;

if (characters_ == NULL) {

characters_ = new (zone()) ZoneList<uc16>(4, zone());

@@ -1133,13 +1064,11 @@

void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {

- DCHECK(unicode());

- AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));

- AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

- } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

- AddLeadSurrogate(c);

- } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

- AddTrailSurrogate(c);

+ ZoneList<uc16> surrogate_pair(2, zone());

+ surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());

+ surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());

+ RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

+ AddAtom(atom);

} else {

AddCharacter(static_cast<uc16>(c));

}

@@ -1147,17 +1076,6 @@

void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

-void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

- if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {

- // In unicode mode, character class needs to be desugared, so it

- // must be a standalone term instead of being part of a RegExpText.

- AddTerm(cc);

- } else {

- AddAtom(cc);

- }

void RegExpBuilder::AddAtom(RegExpTree* term) {

@@ -1172,13 +1090,6 @@

FlushText();

terms_.Add(term, zone());

}

- LAST(ADD_ATOM);

-void RegExpBuilder::AddTerm(RegExpTree* term) {

- FlushText();

- terms_.Add(term, zone());

LAST(ADD_ATOM);

}

@@ -1221,7 +1132,6 @@

void RegExpBuilder::AddQuantifierToAtom(

int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

- FlushPendingSurrogate();

if (pending_empty_) {

pending_empty_ = false;

return;

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »