regexp2000/src/parser.cc - Issue 8871: Experimental RegExp: changed handling of non-standard escape sequences.

Unified Diff: regexp2000/src/parser.cc

Issue 8871: Experimental RegExp: changed handling of non-standard escape sequences. (Closed)

Patch Set: RegExp escape handling, with review comments. Created 12 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: regexp2000/src/parser.cc

diff --git a/regexp2000/src/parser.cc b/regexp2000/src/parser.cc

index 844a9e447120d95139265715738fdd9cc296b1bc..5b15086d75f9a4e93f6443f1c2a0fb61192fca20 100644

--- a/regexp2000/src/parser.cc

+++ b/regexp2000/src/parser.cc

@@ -230,7 +230,9 @@ class Parser {

class RegExpParser {

public:

- RegExpParser(unibrow::CharacterStream* in, Handle<String>* error);

+ RegExpParser(unibrow::CharacterStream* in,

+ Handle<String>* error,

+ bool multiline_mode);

RegExpTree* ParsePattern(bool* ok);

RegExpTree* ParseDisjunction(bool* ok);

RegExpTree* ParseAlternative(bool* ok);

@@ -247,7 +249,9 @@ class RegExpParser {

// must not be 'b' or 'B' since they are usually handled specially.

uc32 ParseCharacterEscape(bool* ok);

- uc32 ParseHexEscape(int length);

+ // Checks whether the following is a length-digit hexadecimal number,

+ // and sets the value if it is.

+ bool ParseHexEscape(int length, uc32* value);

uc32 ParseControlEscape(bool* ok);

uc32 ParseOctalLiteral(bool* ok);

@@ -262,6 +266,14 @@ class RegExpParser {

RegExpTree* ReportError(Vector<const char> message, bool* ok);

void Advance();

void Advance(int dist);

+ // Pushes a read character (or potentially some other character) back

+ // on the input stream. After pushing it back, it becomes the character

+ // returned by current(). There is a limited amount of push-back buffer.

+ // A function using PushBack should check that it doesn't push back more

+ // than kMaxPushback characters, and it should not push back more characters

+ // than it has read itself, or than it knows were read before it was called.

+ void PushBack(uc32 character);

+ bool CanPushBack();

static const uc32 kEndMarker = unibrow::Utf8::kBadChar;

private:

uc32 current() { return current_; }

@@ -273,6 +285,7 @@ class RegExpParser {

uc32 next_;

bool has_more_;

bool has_next_;

+ bool multiline_mode_;

int captures_seen_;

unibrow::CharacterStream* in_;

Handle<String>* error_;

@@ -3220,11 +3233,14 @@ Expression* Parser::NewThrowError(Handle<String> constructor,

// Regular expressions

-RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)

+RegExpParser::RegExpParser(unibrow::CharacterStream* in,

+ Handle<String>* error,

+ bool multiline_mode)

: current_(kEndMarker),

next_(kEndMarker),

has_more_(true),

has_next_(true),

+ multiline_mode_(multiline_mode),

captures_seen_(0),

in_(in),

error_(error),

@@ -3255,6 +3271,26 @@ void RegExpParser::Advance(int dist) {

}

+void RegExpParser::PushBack(uc32 character) {

+ if (has_next_) {

+ ASSERT(pushback_count_ < kMaxPushback);

+ pushback_buffer_[pushback_count_] = next_;

+ pushback_count_++;

+ }

+ if (has_more_) {

+ next_ = current_;

+ has_next_ = true;

+ }

+ current_ = character;

+ has_more_ = true;

+bool RegExpParser::CanPushBack() {

+ return (pushback_count_ < kMaxPushback);

RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {

*ok = false;

*error_ = Factory::NewStringFromAscii(message, NOT_TENURED);

@@ -3358,6 +3394,11 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

ASSERT_EQ('\\', current());

ASSERT('1' <= next() && next() <= '9');

ASSERT_EQ(0, pushback_count_);

+ // Try to parse a decimal literal that is less than the number

+ // of previously encountered left capturing parentheses.

+ // This is not according to the ECMAScript specification. According to

+ // that, one must accept values up to the total number of left capturing

+ // parentheses in the entire input, even if they are meaningless.

if (captures_seen_ == 0)

return false;

int value = next() - '0';

@@ -3377,21 +3418,15 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

if (next_value > captures_seen_ || char_count > kMaxChars) {

// If we give up we have to push the characters we read back

// onto the pushback buffer in the reverse order.

- pushback_buffer_[0] = current();

- for (int i = 0; i < char_count; i++)

- pushback_buffer_[i + 1] = chars_seen[char_count - i - 1];

- pushback_buffer_[char_count + 1] = '\\';

- pushback_count_ = char_count + 2;

- // Then, once we've filled up the buffer, we read the two

- // first characters into the lookahead. This is a roundabout

- // way of doing it but makes the code simpler.

- Advance(2);

+ for (int i = 0; i < char_count; i++) {

+ PushBack(chars_seen[char_count - i - 1]);

+ }

+ PushBack('\\');

return false;

- } else {

- value = next_value;

- chars_seen[char_count++] = current();

- Advance();

}

+ value = next_value;

+ chars_seen[char_count++] = current();

+ Advance();

} else {

*index_out = value;

return true;

@@ -3414,12 +3449,14 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) {

// \ B

case '^':

Advance();

- // Make the type of assertion dependent on multi/nonmultiline.

- return new RegExpAssertion(RegExpAssertion::START_OF_INPUT);

+ return new RegExpAssertion(

+ multiline_mode_ ? RegExpAssertion::START_OF_LINE

+ : RegExpAssertion::START_OF_INPUT);

case '$':

Advance();

- // Make the type of assertion dependent on multi/nonmultiline.

- return new RegExpAssertion(RegExpAssertion::END_OF_INPUT);

+ return new RegExpAssertion(

+ multiline_mode_ ? RegExpAssertion::END_OF_LINE

+ : RegExpAssertion::END_OF_INPUT);

case '.':

Advance();

atom = new RegExpCharacterClass(CharacterRange::CharacterClass('.'));

@@ -3460,7 +3497,7 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) {

goto has_read_atom;

} else {

// If this is not a backreference we go to the atom parser

- // which will read it as an octal escape.

+ // which will read it as an octal escape or identity escape.

goto parse_atom;

}

@@ -3591,6 +3628,8 @@ RegExpTree* RegExpParser::ParseAtom(bool* ok) {

return new RegExpAtom(buf->ToConstVector());

}

+// Upper and lower case letters differ by one bit.

+STATIC_CHECK('a'^'A' == 0x20);

uc32 RegExpParser::ParseControlEscape(bool* ok) {

ASSERT(current() == 'c');

@@ -3598,51 +3637,61 @@ uc32 RegExpParser::ParseControlEscape(bool* ok) {

if (!has_more()) {

ReportError(CStrVector("\\c at end of pattern"), ok);

return '\0';

- } else {

- uc32 letter = current();

- if (!('a' <= letter && letter <= 'z') &&

- !('A' <= letter && letter <= 'Z')) {

- ReportError(CStrVector("Illegal control letter"), ok);

- return '\0';

- }

- Advance();

- return letter & ((1 << 5) - 1);

}

+ uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.

+ if (letter < 'A' || 'Z' < letter) {

+ // Non-spec error-correction: "\c" followed by non-control letter is

+ // interpreted as an IdentityEscape.

+ return 'c';

+ }

+ Advance();

+ return letter & 0x1f; // Remainder modulo 32, per specification.

}

uc32 RegExpParser::ParseOctalLiteral(bool* ok) {

ASSERT('0' <= current() && current() <= '7');

- // Here we're really supposed to break out after the first digit

- // if it is '0' but the other implementations don't do that so

- // neither do we. Is this deviation from the spec error prone?

- // Yes, it's probably as error prone as it's possible to get. Isn't

- // JavaScript wonderful?

- uc32 value = 0;

- while ('0' <= current() && current() <= '7') {

- int next = (8 * value) + (current() - '0');

- if (next >= 256) {

- break;

- } else {

- value = next;

+ // For compatibility with some other browsers (not all), we parse

+ // up to three octal digits with a value below 256.

+ uc32 value = current() - '0';

+ Advance();

+ if ('0' <= current() && current() <= '7') {

+ value = value * 8 + current() - '0';

+ Advance();

+ if (value < 32 && '0' <= current() && current() <= '7') {

+ value = value * 8 + current() - '0';

Advance();

}

return value;

}

-uc32 RegExpParser::ParseHexEscape(int length) {

- uc32 value = 0;

- for (int i = 0; i < length; i++) {

- int d = HexValue(current());

- if (d < 0)

- return value;

- value = value * 16 + d;

+bool RegExpParser::ParseHexEscape(int length, uc32 *value) {

+ static const int kMaxChars = kMaxPushback;

+ EmbeddedVector<uc32, kMaxChars> chars_seen;

+ ASSERT(length <= kMaxChars);

+ uc32 val = 0;

+ bool done = false;

+ for (int i = 0; !done; i++) {

+ uc32 c = current();

+ int d = HexValue(c);

+ if (d < 0) {

+ while (i > 0) {

+ i--;

+ PushBack(chars_seen[i]);

+ }

+ return false;

+ }

+ val = val * 16 + d;

Advance();

+ if (i < length - 1) {

+ chars_seen[i] = c;

+ } else {

+ done = true;

+ }

}

- return value;

+ *value = val;

+ return true;

}

@@ -3670,25 +3719,39 @@ uc32 RegExpParser::ParseCharacterEscape(bool* ok) {

Advance();

return '\v';

case 'c':

+ // Spec mandates that next character is ASCII letter.

+ // If not, we error-correct by interpreting "\c" as "c".

return ParseControlEscape(ok);

case '0': case '1': case '2': case '3': case '4': case '5':

case '6': case '7':

- // We're really supposed to read this as a decimal integer

- // literal which is base 10 but for whatever reason the other

- // implementations read base 8. It's hard to believe that the

- // spec was written by some ofthe same people that wrote the

- // other implementations...

+ // For compatibility, we interpret a decimal escape that isn't

+ // a back reference (and therefore either \0 or not valid according

+ // to the specification) as a 1..3 digit octal character code.

return ParseOctalLiteral(ok);

- case 'x':

+ case 'x': {

Advance();

- return ParseHexEscape(2);

- case 'A': case 'Z': {

- uc32 result = current();

+ uc32 value;

+ if (ParseHexEscape(2, &value)) {

+ return value;

+ }

+ // If \x is not followed by a two-digit hexadecimal, treat it

+ // as an identity escape.

+ return 'x';

+ }

+ case 'u': {

Advance();

- return result;

+ uc32 value;

+ if (ParseHexEscape(4, &value)) {

+ return value;

+ }

+ // If \u is not followed by a four-digit hexadecimal, treat it

+ // as an identity escape.

+ return 'u';

}

default: {

- ASSERT(!Scanner::kIsIdentifierPart.get(current()));

+ // Extended identity escape. We accept any character that hasn't

+ // been matched by a more specific case, not just the subset required

+ // by the ECMAScript specification.

uc32 result = current();

Advance();

return result;

@@ -3847,7 +3910,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,

RegExpTree* ParseRegExp(unibrow::CharacterStream* stream,

Handle<String>* error) {

ASSERT(error->is_null());

- RegExpParser parser(stream, error);

+ RegExpParser parser(stream, error, false); // Get multiline flag somehow

bool ok = true;

RegExpTree* result = parser.ParsePattern(&ok);

if (!ok) {

« no previous file with comments | « no previous file | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »