Index: src/scanner.cc |
=================================================================== |
--- src/scanner.cc (revision 1004) |
+++ src/scanner.cc (working copy) |
@@ -119,6 +119,18 @@ |
} |
+static inline bool IsByteOrderMark(uc32 c) { |
+ // Returns true if c is U+FEFF (the byte-order mark) or U+FFFE. |
+ // U+FFFE is guaranteed never to be assigned as a Unicode character, |
+ // so in a Unicode context the 0xFF, 0xFE byte pattern can only be |
+ // U+FEFF expressed in little-endian byte order (it cannot be a |
+ // U+FFFE character expressed in big-endian byte order). |
+ // Nevertheless, we also accept U+FFFE here to be compatible with |
+ // SpiderMonkey. |
+ return c == 0xFEFF || c == 0xFFFE; |
+} |
+ |
+ |
uc32 UTF16Buffer::Advance() { |
// NOTE: It is of importance to Persian / Farsi resources that we do |
// *not* strip format control characters in the scanner; see |
@@ -126,16 +138,17 @@ |
// https://bugzilla.mozilla.org/show_bug.cgi?id=274152 |
// |
// So, even though ECMA-262, section 7.1, page 11, dictates that we |
- // must remove Unicode format-control characters, we do not. This is |
- // in line with how IE and SpiderMonkey handles it. |
+ // must remove Unicode format-control characters, we only remove the BOM. |
+ // This is in line with how Safari handles it. |
if (!pushback_buffer()->is_empty()) { |
pos_++; |
return last_ = pushback_buffer()->RemoveLast(); |
- } else if (stream_->has_more()) { |
- pos_++; |
- uc32 next = stream_->GetNext(); |
- return last_ = next; |
} else { |
+ while (stream_->has_more()) { |
+ pos_++; |
+ uc32 next = stream_->GetNext(); |
+ if (!IsByteOrderMark(next)) return last_ = next; |
+ } |
// note: currently the following increment is necessary to avoid a |
// test-parser problem! |
pos_++; |
@@ -234,25 +247,11 @@ |
} |
-static inline bool IsByteOrderMark(uc32 c) { |
- // The Unicode value U+FFFE is guaranteed never to be assigned as a |
- // Unicode character; this implies that in a Unicode context the |
- // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF |
- // character expressed in little-endian byte order (since it could |
- // not be a U+FFFE character expressed in big-endian byte |
- // order). Nevertheless, we check for it to be compatible with |
- // Spidermonkey. |
- return c == 0xFEFF || c == 0xFFFE; |
-} |
- |
- |
void Scanner::SkipWhiteSpace(bool initial) { |
has_line_terminator_before_next_ = initial; |
while (true) { |
- // We treat byte-order marks (BOMs) as whitespace for better |
- // compatibility with Spidermonkey and other JavaScript engines. |
- while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { |
+ while (kIsWhiteSpace.get(c0_)) { |
// IsWhiteSpace() includes line terminators! |
if (kIsLineTerminator.get(c0_)) |
// Ignore line terminators, but remember them. This is necessary |