src/scanner.cc - Issue 15075: Handling byte-order marks as specified in Ecmascript-262 and in compliance wi...

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: src/scanner.cc

Issue 15075: Handling byte-order marks as specified in Ecmascript-262 and in compliance wi... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/scanner.cc

===================================================================

--- src/scanner.cc (revision 1004)

+++ src/scanner.cc (working copy)

@@ -119,6 +119,18 @@

}

+static inline bool IsByteOrderMark(uc32 c) {

+ // The Unicode value U+FFFE is guaranteed never to be assigned as a

+ // Unicode character; this implies that in a Unicode context the

+ // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

+ // character expressed in little-endian byte order (since it could

+ // not be a U+FFFE character expressed in big-endian byte

+ // order). Nevertheless, we check for it to be compatible with

+ // Spidermonkey.

+ return c == 0xFEFF || c == 0xFFFE;

uc32 UTF16Buffer::Advance() {

// NOTE: It is of importance to Persian / Farsi resources that we do

// *not* strip format control characters in the scanner; see

@@ -126,16 +138,17 @@

// https://bugzilla.mozilla.org/show_bug.cgi?id=274152

// So, even though ECMA-262, section 7.1, page 11, dictates that we

- // must remove Unicode format-control characters, we do not. This is

- // in line with how IE and SpiderMonkey handles it.

+ // must remove Unicode format-control characters, we only remove the BOM.

+ // This is in line with how Safari handles it.

if (!pushback_buffer()->is_empty()) {

pos_++;

return last_ = pushback_buffer()->RemoveLast();

- } else if (stream_->has_more()) {

- pos_++;

- uc32 next = stream_->GetNext();

- return last_ = next;

} else {

+ while (stream_->has_more()) {

+ pos_++;

+ uc32 next = stream_->GetNext();

+ if (!IsByteOrderMark(next)) return last_ = next;

+ }

// note: currently the following increment is necessary to avoid a

// test-parser problem!

pos_++;

@@ -234,25 +247,11 @@

}

-static inline bool IsByteOrderMark(uc32 c) {

- // The Unicode value U+FFFE is guaranteed never to be assigned as a

- // Unicode character; this implies that in a Unicode context the

- // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

- // character expressed in little-endian byte order (since it could

- // not be a U+FFFE character expressed in big-endian byte

- // order). Nevertheless, we check for it to be compatible with

- // Spidermonkey.

- return c == 0xFEFF || c == 0xFFFE;

void Scanner::SkipWhiteSpace(bool initial) {

has_line_terminator_before_next_ = initial;

while (true) {

- // We treat byte-order marks (BOMs) as whitespace for better

- // compatibility with Spidermonkey and other JavaScript engines.

- while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {

+ while (kIsWhiteSpace.get(c0_)) {

// IsWhiteSpace() includes line terminators!

if (kIsLineTerminator.get(c0_))

// Ignore line terminators, but remember them. This is necessary

« no previous file with comments | « no previous file | test/mjsunit/bom.js » ('j') | no next file with comments »