src/scanner-base.cc - Issue 7677012: Make scanner handle invalid unicode escapes in identifiers correctly.

Unified Diff: src/scanner-base.cc

Issue 7677012: Make scanner handle invalid unicode escapes in identifiers correctly. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/scanner-base.cc

diff --git a/src/scanner-base.cc b/src/scanner-base.cc

index 2ecbfd2a95be8281c3bd38f93c3720ca0df55318..c8a86b21356e916e8a0a8b9dae68d1728b220269 100644

--- a/src/scanner-base.cc

+++ b/src/scanner-base.cc

@@ -41,12 +41,12 @@ Scanner::Scanner(UnicodeCache* unicode_cache)

: unicode_cache_(unicode_cache) { }

-uc32 Scanner::ScanHexEscape(uc32 c, int length) {

- ASSERT(length <= 4); // prevent overflow

+uc32 Scanner::ScanHexNumber(int expected_length) {

+ ASSERT(expected_length <= 4); // prevent overflow

- uc32 digits[4];

+ uc32 digits[4] = { 0, 0, 0, 0 };

uc32 x = 0;

- for (int i = 0; i < length; i++) {

+ for (int i = 0; i < expected_length; i++) {

digits[i] = c0_;

int d = HexValue(c0_);

if (d < 0) {

@@ -54,12 +54,11 @@ uc32 Scanner::ScanHexEscape(uc32 c, int length) {

// should be illegal, but other JS VMs just return the

// non-escaped version of the original character.

- // Push back digits read, except the last one (in c0_).

+ // Push back digits that we have advanced past.

for (int j = i-1; j >= 0; j--) {

PushBack(digits[j]);

}

- // Notice: No handling of error - treat it as "\u"->"u".

- return c;

+ return -1;

}

x = x * 16 + d;

Advance();

@@ -640,9 +639,17 @@ void JavaScriptScanner::ScanEscape() {

case 'n' : c = '\n'; break;

case 'r' : c = '\r'; break;

case 't' : c = '\t'; break;

- case 'u' : c = ScanHexEscape(c, 4); break;

+ case 'u' : {

+ c = ScanHexNumber(4);

+ if (c < 0) c = 'u';

+ break;

+ }

case 'v' : c = '\v'; break;

- case 'x' : c = ScanHexEscape(c, 2); break;

+ case 'x' : {

+ c = ScanHexNumber(2);

+ if (c < 0) c = 'x';

+ break;

+ }

case '0' : // fall through

case '1' : // fall through

case '2' : // fall through

@@ -802,13 +809,11 @@ Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {

uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {

Advance();

- if (c0_ != 'u') return unibrow::Utf8::kBadChar;

+ if (c0_ != 'u') return -1;

Advance();

- uc32 c = ScanHexEscape('u', 4);

- // We do not allow a unicode escape sequence to start another

- // unicode escape sequence.

- if (c == '\\') return unibrow::Utf8::kBadChar;

- return c;

+ uc32 result = ScanHexNumber(4);

+ if (result < 0) PushBack('u');

+ return result;

}

@@ -926,7 +931,11 @@ Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {

if (c0_ == '\\') {

uc32 c = ScanIdentifierUnicodeEscape();

// Only allow legal identifier start characters.

- if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;

+ if (c < 0 ||

+ c == '\\' || // No recursive escapes.

+ !unicode_cache_->IsIdentifierStart(c)) {

+ return Token::ILLEGAL;

+ }

AddLiteralChar(c);

return ScanIdentifierSuffix(&literal);

}

@@ -966,7 +975,11 @@ Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {

if (c0_ == '\\') {

uc32 c = ScanIdentifierUnicodeEscape();

// Only allow legal identifier part characters.

- if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;

+ if (c < 0 ||

+ c == '\\' ||

+ !unicode_cache_->IsIdentifierPart(c)) {

+ return Token::ILLEGAL;

+ }

AddLiteralChar(c);

} else {

AddLiteralChar(c0_);

@@ -992,8 +1005,9 @@ bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {

// the scanner should pass uninterpreted bodies to the RegExp

// constructor.

LiteralScope literal(this);

- if (seen_equal)

+ if (seen_equal) {

AddLiteralChar('=');

+ }

while (c0_ != '/' || in_character_class) {

if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;

@@ -1025,20 +1039,48 @@ bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {

}

+bool JavaScriptScanner::ScanLiteralUnicodeEscape() {

+ ASSERT(c0_ == '\\');

+ uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};

+ Advance();

+ int i = 1;

+ if (c0_ == 'u') {

+ Advance();

+ i++;

+ while (i < 6) {

+ Advance();

+ if (!IsHexDigit(c0_)) break;

+ chars_read[i] = c0_;

+ i++;

+ }

+ if (i < 6) {

+ // Incomplete escape. Undo all advances and return false.

+ while (i > 0) {

+ i--;

+ PushBack(chars_read[i]);

+ }

+ return false;

+ }

+ // Complete escape. Add all chars to current literal buffer.

+ for (int i = 0; i < 6; i++) {

+ AddLiteralChar(chars_read[i]);

+ return true;

Rico 2011/08/18 11:43:13 Indentation seems wrong

Lasse Reichstein 2011/08/24 13:36:28 Argh, more than wrong. The return has moved itself

+ }

bool JavaScriptScanner::ScanRegExpFlags() {

// Scan regular expression flags.

LiteralScope literal(this);

while (unicode_cache_->IsIdentifierPart(c0_)) {

- if (c0_ == '\\') {

- uc32 c = ScanIdentifierUnicodeEscape();

- if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {

- // We allow any escaped character, unlike the restriction on

- // IdentifierPart when it is used to build an IdentifierName.

- AddLiteralChar(c);

- continue;

+ if (c0_ != '\\') {

+ AddLiteralCharAdvance();

+ } else {

+ if (!ScanLiteralUnicodeEscape()) {

+ break;

}

- AddLiteralCharAdvance();

}

literal.Complete();

« no previous file with comments | « src/scanner-base.h ('k') | test/mjsunit/regress/regress-1620.js » ('j') | test/mjsunit/regress/regress-1620.js » ('J')