Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(576)

Unified Diff: regexp2000/src/parser.cc

Issue 8871: Experimental RegExp: changed handling of non-standard escape sequences. (Closed)
Patch Set: RegExp escape handling, with review comments. Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: regexp2000/src/parser.cc
diff --git a/regexp2000/src/parser.cc b/regexp2000/src/parser.cc
index 844a9e447120d95139265715738fdd9cc296b1bc..5b15086d75f9a4e93f6443f1c2a0fb61192fca20 100644
--- a/regexp2000/src/parser.cc
+++ b/regexp2000/src/parser.cc
@@ -230,7 +230,9 @@ class Parser {
class RegExpParser {
public:
- RegExpParser(unibrow::CharacterStream* in, Handle<String>* error);
+ RegExpParser(unibrow::CharacterStream* in,
+ Handle<String>* error,
+ bool multiline_mode);
RegExpTree* ParsePattern(bool* ok);
RegExpTree* ParseDisjunction(bool* ok);
RegExpTree* ParseAlternative(bool* ok);
@@ -247,7 +249,9 @@ class RegExpParser {
// must not be 'b' or 'B' since they are usually handled specially.
uc32 ParseCharacterEscape(bool* ok);
- uc32 ParseHexEscape(int length);
+ // Checks whether the following is a length-digit hexadecimal number,
+ // and sets the value if it is.
+ bool ParseHexEscape(int length, uc32* value);
uc32 ParseControlEscape(bool* ok);
uc32 ParseOctalLiteral(bool* ok);
@@ -262,6 +266,14 @@ class RegExpParser {
RegExpTree* ReportError(Vector<const char> message, bool* ok);
void Advance();
void Advance(int dist);
+ // Pushes a read character (or potentially some other character) back
+ // on the input stream. After pushing it back, it becomes the character
+ // returned by current(). There is a limited amount of push-back buffer.
+ // A function using PushBack should check that it doesn't push back more
+ // than kMaxPushback characters, and it should not push back more characters
+ // than it has read itself or knows had been read before it was called.
+ void PushBack(uc32 character);
+ bool CanPushBack();
static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
private:
uc32 current() { return current_; }
@@ -273,6 +285,7 @@ class RegExpParser {
uc32 next_;
bool has_more_;
bool has_next_;
+ bool multiline_mode_;
int captures_seen_;
unibrow::CharacterStream* in_;
Handle<String>* error_;
@@ -3220,11 +3233,14 @@ Expression* Parser::NewThrowError(Handle<String> constructor,
// Regular expressions
-RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)
+RegExpParser::RegExpParser(unibrow::CharacterStream* in,
+ Handle<String>* error,
+ bool multiline_mode)
: current_(kEndMarker),
next_(kEndMarker),
has_more_(true),
has_next_(true),
+ multiline_mode_(multiline_mode),
captures_seen_(0),
in_(in),
error_(error),
@@ -3255,6 +3271,26 @@ void RegExpParser::Advance(int dist) {
}
+void RegExpParser::PushBack(uc32 character) {
+ if (has_next_) {
+ ASSERT(pushback_count_ < kMaxPushback);
+ pushback_buffer_[pushback_count_] = next_;
+ pushback_count_++;
+ }
+ if (has_more_) {
+ next_ = current_;
+ has_next_ = true;
+ }
+ current_ = character;
+ has_more_ = true;
+}
+
+
+bool RegExpParser::CanPushBack() {
+ return (pushback_count_ < kMaxPushback);
+}
+
+
RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
*ok = false;
*error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
@@ -3358,6 +3394,11 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
ASSERT_EQ('\\', current());
ASSERT('1' <= next() && next() <= '9');
ASSERT_EQ(0, pushback_count_);
+ // Try to parse a decimal literal that is less than the number
+ // of previously encountered left capturing parentheses.
+ // This is not according to the ECMAScript specification. According to
+ // that, one must accept values up to the total number of left capturing
+ // parentheses in the entire input, even if they are meaningless.
if (captures_seen_ == 0)
return false;
int value = next() - '0';
@@ -3377,21 +3418,15 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
if (next_value > captures_seen_ || char_count > kMaxChars) {
// If we give up we have to push the characters we read back
// onto the pushback buffer in the reverse order.
- pushback_buffer_[0] = current();
- for (int i = 0; i < char_count; i++)
- pushback_buffer_[i + 1] = chars_seen[char_count - i - 1];
- pushback_buffer_[char_count + 1] = '\\';
- pushback_count_ = char_count + 2;
- // Then, once we've filled up the buffer, we read the two
- // first characters into the lookahead. This is a roundabout
- // way of doing it but makes the code simpler.
- Advance(2);
+ for (int i = 0; i < char_count; i++) {
+ PushBack(chars_seen[char_count - i - 1]);
+ }
+ PushBack('\\');
return false;
- } else {
- value = next_value;
- chars_seen[char_count++] = current();
- Advance();
}
+ value = next_value;
+ chars_seen[char_count++] = current();
+ Advance();
} else {
*index_out = value;
return true;
@@ -3414,12 +3449,14 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) {
// \ B
case '^':
Advance();
- // Make the type of assertion dependent on multi/nonmultiline.
- return new RegExpAssertion(RegExpAssertion::START_OF_INPUT);
+ return new RegExpAssertion(
+ multiline_mode_ ? RegExpAssertion::START_OF_LINE
+ : RegExpAssertion::START_OF_INPUT);
case '$':
Advance();
- // Make the type of assertion dependent on multi/nonmultiline.
- return new RegExpAssertion(RegExpAssertion::END_OF_INPUT);
+ return new RegExpAssertion(
+ multiline_mode_ ? RegExpAssertion::END_OF_LINE
+ : RegExpAssertion::END_OF_INPUT);
case '.':
Advance();
atom = new RegExpCharacterClass(CharacterRange::CharacterClass('.'));
@@ -3460,7 +3497,7 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) {
goto has_read_atom;
} else {
// If this is not a backreference we go to the atom parser
- // which will read it as an octal escape.
+ // which will read it as an octal escape or identity escape.
goto parse_atom;
}
}
@@ -3591,6 +3628,8 @@ RegExpTree* RegExpParser::ParseAtom(bool* ok) {
return new RegExpAtom(buf->ToConstVector());
}
+// Upper and lower case letters differ by one bit.
+STATIC_CHECK('a'^'A' == 0x20);
uc32 RegExpParser::ParseControlEscape(bool* ok) {
ASSERT(current() == 'c');
@@ -3598,51 +3637,61 @@ uc32 RegExpParser::ParseControlEscape(bool* ok) {
if (!has_more()) {
ReportError(CStrVector("\\c at end of pattern"), ok);
return '\0';
- } else {
- uc32 letter = current();
- if (!('a' <= letter && letter <= 'z') &&
- !('A' <= letter && letter <= 'Z')) {
- ReportError(CStrVector("Illegal control letter"), ok);
- return '\0';
- }
- Advance();
- return letter & ((1 << 5) - 1);
}
+ uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.
+ if (letter < 'A' || 'Z' < letter) {
+ // Non-spec error-correction: "\c" followed by non-control letter is
+ // interpreted as an IdentityEscape.
+ return 'c';
+ }
+ Advance();
+ return letter & 0x1f; // Remainder modulo 32, per specification.
}
uc32 RegExpParser::ParseOctalLiteral(bool* ok) {
ASSERT('0' <= current() && current() <= '7');
- // Here we're really supposed to break out after the first digit
- // if it is '0' but the other implementations don't do that so
- // neither do we. Is this deviation from the spec error prone?
- // Yes, it's probably as error prone as it's possible to get. Isn't
- // JavaScript wonderful?
- uc32 value = 0;
- while ('0' <= current() && current() <= '7') {
- int next = (8 * value) + (current() - '0');
- if (next >= 256) {
- break;
- } else {
- value = next;
+ // For compatibility with some other browsers (not all), we parse
+ // up to three octal digits with a value below 256.
+ uc32 value = current() - '0';
+ Advance();
+ if ('0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ if (value < 32 && '0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
Advance();
}
}
return value;
}
-
-uc32 RegExpParser::ParseHexEscape(int length) {
- uc32 value = 0;
- for (int i = 0; i < length; i++) {
- int d = HexValue(current());
- if (d < 0)
- return value;
- value = value * 16 + d;
+bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
+ static const int kMaxChars = kMaxPushback;
+ EmbeddedVector<uc32, kMaxChars> chars_seen;
+ ASSERT(length <= kMaxChars);
+ uc32 val = 0;
+ bool done = false;
+ for (int i = 0; !done; i++) {
+ uc32 c = current();
+ int d = HexValue(c);
+ if (d < 0) {
+ while (i > 0) {
+ i--;
+ PushBack(chars_seen[i]);
+ }
+ return false;
+ }
+ val = val * 16 + d;
Advance();
+ if (i < length - 1) {
+ chars_seen[i] = c;
+ } else {
+ done = true;
+ }
}
-
- return value;
+ *value = val;
+ return true;
}
@@ -3670,25 +3719,39 @@ uc32 RegExpParser::ParseCharacterEscape(bool* ok) {
Advance();
return '\v';
case 'c':
+ // Spec mandates that next character is ASCII letter.
+ // If not, we error-correct by interpreting "\c" as "c".
return ParseControlEscape(ok);
case '0': case '1': case '2': case '3': case '4': case '5':
case '6': case '7':
- // We're really supposed to read this as a decimal integer
- // literal which is base 10 but for whatever reason the other
- // implementations read base 8. It's hard to believe that the
- // spec was written by some ofthe same people that wrote the
- // other implementations...
+ // For compatibility, we interpret a decimal escape that isn't
+ // a back reference (and therefore either \0 or not valid according
+ // to the specification) as a 1..3 digit octal character code.
return ParseOctalLiteral(ok);
- case 'x':
+ case 'x': {
Advance();
- return ParseHexEscape(2);
- case 'A': case 'Z': {
- uc32 result = current();
+ uc32 value;
+ if (ParseHexEscape(2, &value)) {
+ return value;
+ }
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
+ }
+ case 'u': {
Advance();
- return result;
+ uc32 value;
+ if (ParseHexEscape(4, &value)) {
+ return value;
+ }
+ // If \u is not followed by a four-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'u';
}
default: {
- ASSERT(!Scanner::kIsIdentifierPart.get(current()));
+ // Extended identity escape. We accept any character that hasn't
+ // been matched by a more specific case, not just the subset required
+ // by the ECMAScript specification.
uc32 result = current();
Advance();
return result;
@@ -3847,7 +3910,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
RegExpTree* ParseRegExp(unibrow::CharacterStream* stream,
Handle<String>* error) {
ASSERT(error->is_null());
- RegExpParser parser(stream, error);
+ RegExpParser parser(stream, error, false); // Get multiline flag somehow
bool ok = true;
RegExpTree* result = parser.ParsePattern(&ok);
if (!ok) {
« no previous file with comments | « no previous file | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698