Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(38)

Side by Side Diff: src/scanner.cc

Issue 15075: Handling byte-order marks as specified in Ecmascript-262 and in compliance wi... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | test/mjsunit/bom.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after
112 } 112 }
113 113
114 114
115 void UTF16Buffer::PushBack(uc32 ch) { 115 void UTF16Buffer::PushBack(uc32 ch) {
116 pushback_buffer()->Add(last_); 116 pushback_buffer()->Add(last_);
117 last_ = ch; 117 last_ = ch;
118 pos_--; 118 pos_--;
119 } 119 }
120 120
121 121
122 static inline bool IsByteOrderMark(uc32 c) {
123 // The Unicode value U+FFFE is guaranteed never to be assigned as a
124 // Unicode character; this implies that in a Unicode context the
125 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
126 // character expressed in little-endian byte order (since it could
127 // not be a U+FFFE character expressed in big-endian byte
128 // order). Nevertheless, we check for it to be compatible with
129 // Spidermonkey.
130 return c == 0xFEFF || c == 0xFFFE;
131 }
132
133
122 uc32 UTF16Buffer::Advance() { 134 uc32 UTF16Buffer::Advance() {
123 // NOTE: It is of importance to Persian / Farsi resources that we do 135 // NOTE: It is of importance to Persian / Farsi resources that we do
124 // *not* strip format control characters in the scanner; see 136 // *not* strip format control characters in the scanner; see
125 // 137 //
126 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 138 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152
127 // 139 //
128 // So, even though ECMA-262, section 7.1, page 11, dictates that we 140 // So, even though ECMA-262, section 7.1, page 11, dictates that we
129 // must remove Unicode format-control characters, we do not. This is 141 // must remove Unicode format-control characters, we only remove the BOM.
130 // in line with how IE and SpiderMonkey handles it. 142 // This is in line with how Safari handles it.
131 if (!pushback_buffer()->is_empty()) { 143 if (!pushback_buffer()->is_empty()) {
132 pos_++; 144 pos_++;
133 return last_ = pushback_buffer()->RemoveLast(); 145 return last_ = pushback_buffer()->RemoveLast();
134 } else if (stream_->has_more()) {
135 pos_++;
136 uc32 next = stream_->GetNext();
137 return last_ = next;
138 } else { 146 } else {
147 while (stream_->has_more()) {
148 pos_++;
149 uc32 next = stream_->GetNext();
150 if (!IsByteOrderMark(next)) return last_ = next;
151 }
139 // note: currently the following increment is necessary to avoid a 152 // note: currently the following increment is necessary to avoid a
140 // test-parser problem! 153 // test-parser problem!
141 pos_++; 154 pos_++;
142 return last_ = static_cast<uc32>(-1); 155 return last_ = static_cast<uc32>(-1);
143 } 156 }
144 } 157 }
145 158
146 159
147 void UTF16Buffer::SeekForward(int pos) { 160 void UTF16Buffer::SeekForward(int pos) {
148 pos_ = pos; 161 pos_ = pos;
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
227 c0_ = source_.Advance(); 240 c0_ = source_.Advance();
228 } 241 }
229 242
230 243
231 void Scanner::PushBack(uc32 ch) { 244 void Scanner::PushBack(uc32 ch) {
232 source_.PushBack(ch); 245 source_.PushBack(ch);
233 c0_ = ch; 246 c0_ = ch;
234 } 247 }
235 248
236 249
237 static inline bool IsByteOrderMark(uc32 c) {
238 // The Unicode value U+FFFE is guaranteed never to be assigned as a
239 // Unicode character; this implies that in a Unicode context the
240 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
241 // character expressed in little-endian byte order (since it could
242 // not be a U+FFFE character expressed in big-endian byte
243 // order). Nevertheless, we check for it to be compatible with
244 // Spidermonkey.
245 return c == 0xFEFF || c == 0xFFFE;
246 }
247
248
249 void Scanner::SkipWhiteSpace(bool initial) { 250 void Scanner::SkipWhiteSpace(bool initial) {
250 has_line_terminator_before_next_ = initial; 251 has_line_terminator_before_next_ = initial;
251 252
252 while (true) { 253 while (true) {
253 // We treat byte-order marks (BOMs) as whitespace for better 254 while (kIsWhiteSpace.get(c0_)) {
254 // compatibility with Spidermonkey and other JavaScript engines.
255 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
256 // IsWhiteSpace() includes line terminators! 255 // IsWhiteSpace() includes line terminators!
257 if (kIsLineTerminator.get(c0_)) 256 if (kIsLineTerminator.get(c0_))
258 // Ignore line terminators, but remember them. This is necessary 257 // Ignore line terminators, but remember them. This is necessary
259 // for automatic semicolon insertion. 258 // for automatic semicolon insertion.
260 has_line_terminator_before_next_ = true; 259 has_line_terminator_before_next_ = true;
261 Advance(); 260 Advance();
262 } 261 }
263 262
264 // If there is an HTML comment end '-->' at the beginning of a 263 // If there is an HTML comment end '-->' at the beginning of a
265 // line (with only whitespace in front of it), we treat the rest 264 // line (with only whitespace in front of it), we treat the rest
(...skipping 565 matching lines...) Expand 10 before | Expand all | Expand 10 after
831 StartLiteral(); 830 StartLiteral();
832 while (kIsIdentifierPart.get(c0_)) 831 while (kIsIdentifierPart.get(c0_))
833 AddCharAdvance(); 832 AddCharAdvance();
834 TerminateLiteral(); 833 TerminateLiteral();
835 834
836 next_.location.end_pos = source_pos() - 1; 835 next_.location.end_pos = source_pos() - 1;
837 return true; 836 return true;
838 } 837 }
839 838
840 } } // namespace v8::internal 839 } } // namespace v8::internal
OLD | NEW
« no previous file with comments | « no previous file | test/mjsunit/bom.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698