src/scanner.cc - Issue 15075: Handling byte-order marks as specified in Ecmascript-262 and in compliance wi...

Side by Side Diff: src/scanner.cc

Issue 15075: Handling byte-order marks as specified in Ecmascript-262 and in compliance wi... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 101 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
112 }	112 }

113	113

114	114

115 void UTF16Buffer::PushBack(uc32 ch) {	115 void UTF16Buffer::PushBack(uc32 ch) {

116 pushback_buffer()->Add(last_);	116 pushback_buffer()->Add(last_);

117 last_ = ch;	117 last_ = ch;

118 pos_--;	118 pos_--;

119 }	119 }

120	120

121	121

	122 static inline bool IsByteOrderMark(uc32 c) {

	123 // The Unicode value U+FFFE is guaranteed never to be assigned as a

	124 // Unicode character; this implies that in a Unicode context the

	125 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

	126 // character expressed in little-endian byte order (since it could

	127 // not be a U+FFFE character expressed in big-endian byte

	128 // order). Nevertheless, we check for it to be compatible with

	129 // Spidermonkey.

	130 return c == 0xFEFF || c == 0xFFFE;

	131 }

	132

	133

122 uc32 UTF16Buffer::Advance() {	134 uc32 UTF16Buffer::Advance() {

123 // NOTE: It is of importance to Persian / Farsi resources that we do	135 // NOTE: It is of importance to Persian / Farsi resources that we do

124 // not strip format control characters in the scanner; see	136 // not strip format control characters in the scanner; see

125 //	137 //

126 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152	138 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152

127 //	139 //

128 // So, even though ECMA-262, section 7.1, page 11, dictates that we	140 // So, even though ECMA-262, section 7.1, page 11, dictates that we

129 // must remove Unicode format-control characters, we do not. This is	141 // must remove Unicode format-control characters, we only remove the BOM.

130 // in line with how IE and SpiderMonkey handles it.	142 // This is in line with how Safari handles it.

131 if (!pushback_buffer()->is_empty()) {	143 if (!pushback_buffer()->is_empty()) {

132 pos_++;	144 pos_++;

133 return last_ = pushback_buffer()->RemoveLast();	145 return last_ = pushback_buffer()->RemoveLast();

134 } else if (stream_->has_more()) {

135 pos_++;

136 uc32 next = stream_->GetNext();

137 return last_ = next;

138 } else {	146 } else {

	147 while (stream_->has_more()) {

	148 pos_++;

	149 uc32 next = stream_->GetNext();

	150 if (!IsByteOrderMark(next)) return last_ = next;

	151 }

139 // note: currently the following increment is necessary to avoid a	152 // note: currently the following increment is necessary to avoid a

140 // test-parser problem!	153 // test-parser problem!

141 pos_++;	154 pos_++;

142 return last_ = static_cast<uc32>(-1);	155 return last_ = static_cast<uc32>(-1);

143 }	156 }

144 }	157 }

145	158

146	159

147 void UTF16Buffer::SeekForward(int pos) {	160 void UTF16Buffer::SeekForward(int pos) {

148 pos_ = pos;	161 pos_ = pos;

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
227 c0_ = source_.Advance();	240 c0_ = source_.Advance();

228 }	241 }

229	242

230	243

231 void Scanner::PushBack(uc32 ch) {	244 void Scanner::PushBack(uc32 ch) {

232 source_.PushBack(ch);	245 source_.PushBack(ch);

233 c0_ = ch;	246 c0_ = ch;

234 }	247 }

235	248

236	249

237 static inline bool IsByteOrderMark(uc32 c) {

238 // The Unicode value U+FFFE is guaranteed never to be assigned as a

239 // Unicode character; this implies that in a Unicode context the

240 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

241 // character expressed in little-endian byte order (since it could

242 // not be a U+FFFE character expressed in big-endian byte

243 // order). Nevertheless, we check for it to be compatible with

244 // Spidermonkey.

245 return c == 0xFEFF || c == 0xFFFE;

246 }

247

248

249 void Scanner::SkipWhiteSpace(bool initial) {	250 void Scanner::SkipWhiteSpace(bool initial) {

250 has_line_terminator_before_next_ = initial;	251 has_line_terminator_before_next_ = initial;

251	252

252 while (true) {	253 while (true) {

253 // We treat byte-order marks (BOMs) as whitespace for better	254 while (kIsWhiteSpace.get(c0_)) {

254 // compatibility with Spidermonkey and other JavaScript engines.

255 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {

256 // IsWhiteSpace() includes line terminators!	255 // IsWhiteSpace() includes line terminators!

257 if (kIsLineTerminator.get(c0_))	256 if (kIsLineTerminator.get(c0_))

258 // Ignore line terminators, but remember them. This is necessary	257 // Ignore line terminators, but remember them. This is necessary

259 // for automatic semicolon insertion.	258 // for automatic semicolon insertion.

260 has_line_terminator_before_next_ = true;	259 has_line_terminator_before_next_ = true;

261 Advance();	260 Advance();

262 }	261 }

263	262

264 // If there is an HTML comment end '-->' at the beginning of a	263 // If there is an HTML comment end '-->' at the beginning of a

265 // line (with only whitespace in front of it), we treat the rest	264 // line (with only whitespace in front of it), we treat the rest

(...skipping 565 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
831 StartLiteral();	830 StartLiteral();

832 while (kIsIdentifierPart.get(c0_))	831 while (kIsIdentifierPart.get(c0_))

833 AddCharAdvance();	832 AddCharAdvance();

834 TerminateLiteral();	833 TerminateLiteral();

835	834

836 next_.location.end_pos = source_pos() - 1;	835 next_.location.end_pos = source_pos() - 1;

837 return true;	836 return true;

838 }	837 }

839	838

840 } } // namespace v8::internal	839 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « no previous file | test/mjsunit/bom.js » ('j') | no next file with comments »