OLD | NEW |
1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 18 matching lines...) Expand all Loading... |
29 | 29 |
30 #include "ast.h" | 30 #include "ast.h" |
31 #include "handles.h" | 31 #include "handles.h" |
32 #include "scanner.h" | 32 #include "scanner.h" |
33 #include "unicode-inl.h" | 33 #include "unicode-inl.h" |
34 | 34 |
35 namespace v8 { | 35 namespace v8 { |
36 namespace internal { | 36 namespace internal { |
37 | 37 |
38 // ---------------------------------------------------------------------------- | 38 // ---------------------------------------------------------------------------- |
39 // UTF8Buffer | |
40 | |
41 UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity), recording_(false) { } | |
42 | |
43 | |
44 UTF8Buffer::~UTF8Buffer() {} | |
45 | |
46 | |
47 void UTF8Buffer::AddCharSlow(uc32 c) { | |
48 ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar); | |
49 int length = unibrow::Utf8::Length(c); | |
50 Vector<char> block = buffer_.AddBlock(length, '\0'); | |
51 #ifdef DEBUG | |
52 int written_length = unibrow::Utf8::Encode(block.start(), c); | |
53 CHECK_EQ(length, written_length); | |
54 #else | |
55 unibrow::Utf8::Encode(block.start(), c); | |
56 #endif | |
57 } | |
58 | |
59 | |
60 // ---------------------------------------------------------------------------- | |
61 // UTF16Buffer | 39 // UTF16Buffer |
62 | 40 |
63 | |
64 UTF16Buffer::UTF16Buffer() | |
65 : pos_(0), end_(Scanner::kNoEndPosition) { } | |
66 | |
67 | |
68 // CharacterStreamUTF16Buffer | 41 // CharacterStreamUTF16Buffer |
69 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() | 42 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() |
70 : pushback_buffer_(0), last_(0), stream_(NULL) { } | 43 : pushback_buffer_(0), last_(0), stream_(NULL) { } |
71 | 44 |
72 | 45 |
73 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data, | 46 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data, |
74 unibrow::CharacterStream* input, | 47 unibrow::CharacterStream* input, |
75 int start_position, | 48 int start_position, |
76 int end_position) { | 49 int end_position) { |
77 stream_ = input; | 50 stream_ = input; |
78 if (start_position > 0) { | 51 if (start_position > 0) { |
79 SeekForward(start_position); | 52 SeekForward(start_position); |
80 } | 53 } |
81 end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt; | 54 end_ = end_position != kNoEndPosition ? end_position : kMaxInt; |
82 } | 55 } |
83 | 56 |
84 | 57 |
85 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { | 58 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { |
86 pushback_buffer()->Add(last_); | 59 pushback_buffer()->Add(last_); |
87 last_ = ch; | 60 last_ = ch; |
88 pos_--; | 61 pos_--; |
89 } | 62 } |
90 | 63 |
91 | 64 |
92 uc32 CharacterStreamUTF16Buffer::Advance() { | 65 uc32 CharacterStreamUTF16Buffer::Advance() { |
93 ASSERT(end_ != Scanner::kNoEndPosition); | 66 ASSERT(end_ != kNoEndPosition); |
94 ASSERT(end_ >= 0); | 67 ASSERT(end_ >= 0); |
95 // NOTE: It is of importance to Persian / Farsi resources that we do | 68 // NOTE: It is of importance to Persian / Farsi resources that we do |
96 // *not* strip format control characters in the scanner; see | 69 // *not* strip format control characters in the scanner; see |
97 // | 70 // |
98 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 | 71 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 |
99 // | 72 // |
100 // So, even though ECMA-262, section 7.1, page 11, dictates that we | 73 // So, even though ECMA-262, section 7.1, page 11, dictates that we |
101 // must remove Unicode format-control characters, we do not. This is | 74 // must remove Unicode format-control characters, we do not. This is |
102 // in line with how IE and SpiderMonkey handle it. | 75 // in line with how IE and SpiderMonkey handle it. |
103 if (!pushback_buffer()->is_empty()) { | 76 if (!pushback_buffer()->is_empty()) { |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
136 if (!complete_) scanner_->DropLiteral(); | 109 if (!complete_) scanner_->DropLiteral(); |
137 } | 110 } |
138 | 111 |
139 | 112 |
140 void Scanner::LiteralScope::Complete() { | 113 void Scanner::LiteralScope::Complete() { |
141 scanner_->TerminateLiteral(); | 114 scanner_->TerminateLiteral(); |
142 complete_ = true; | 115 complete_ = true; |
143 } | 116 } |
144 | 117 |
145 // ---------------------------------------------------------------------------- | 118 // ---------------------------------------------------------------------------- |
146 // Scanner | 119 // V8JavaScriptScanner |
147 | 120 |
148 Scanner::Scanner() | 121 void V8JavaScriptScanner::Initialize(Handle<String> source) { |
149 : has_line_terminator_before_next_(false), | 122 source_ = stream_initializer_.Init(source, NULL, 0, source->length()); |
150 is_parsing_json_(false), | 123 Init(); |
151 source_(NULL), | 124 // Skip initial whitespace allowing HTML comment ends just like |
152 stack_overflow_(false) {} | 125 // after a newline and scan first token. |
153 | 126 has_line_terminator_before_next_ = true; |
154 | 127 SkipWhiteSpace(); |
155 void Scanner::Initialize(Handle<String> source, | 128 Scan(); |
156 ParserLanguage language) { | |
157 Init(source, NULL, 0, source->length(), language); | |
158 } | 129 } |
159 | 130 |
160 | 131 |
161 void Scanner::Initialize(Handle<String> source, | 132 void V8JavaScriptScanner::Initialize(Handle<String> source, |
162 unibrow::CharacterStream* stream, | 133 unibrow::CharacterStream* stream) { |
163 ParserLanguage language) { | 134 source_ = stream_initializer_.Init(source, stream, |
164 Init(source, stream, 0, kNoEndPosition, language); | 135 0, UTF16Buffer::kNoEndPosition); |
| 136 Init(); |
| 137 // Skip initial whitespace allowing HTML comment ends just like |
| 138 // after a newline and scan first token. |
| 139 has_line_terminator_before_next_ = true; |
| 140 SkipWhiteSpace(); |
| 141 Scan(); |
165 } | 142 } |
166 | 143 |
167 | 144 |
168 void Scanner::Initialize(Handle<String> source, | 145 void V8JavaScriptScanner::Initialize(Handle<String> source, |
169 int start_position, | 146 int start_position, |
170 int end_position, | 147 int end_position) { |
171 ParserLanguage language) { | 148 source_ = stream_initializer_.Init(source, NULL, |
172 Init(source, NULL, start_position, end_position, language); | 149 start_position, end_position); |
| 150 Init(); |
| 151 // Skip initial whitespace allowing HTML comment ends just like |
| 152 // after a newline and scan first token. |
| 153 has_line_terminator_before_next_ = true; |
| 154 SkipWhiteSpace(); |
| 155 Scan(); |
173 } | 156 } |
174 | 157 |
175 | 158 |
176 void Scanner::Init(Handle<String> source, | 159 Token::Value V8JavaScriptScanner::NextCheckStack() { |
177 unibrow::CharacterStream* stream, | 160 // BUG 1215673: Find a thread safe way to set a stack limit in |
178 int start_position, | 161 // pre-parse mode. Otherwise, we cannot safely pre-parse from other |
179 int end_position, | 162 // threads. |
180 ParserLanguage language) { | 163 StackLimitCheck check; |
| 164 if (check.HasOverflowed()) { |
| 165 stack_overflow_ = true; |
| 166 current_ = next_; |
| 167 next_.token = Token::ILLEGAL; |
| 168 return current_.token; |
| 169 } else { |
| 170 return Next(); |
| 171 } |
| 172 } |
| 173 |
| 174 |
| 175 UTF16Buffer* StreamInitializer::Init(Handle<String> source, |
| 176 unibrow::CharacterStream* stream, |
| 177 int start_position, |
| 178 int end_position) { |
181 // Either initialize the scanner from a character stream or from a | 179 // Either initialize the scanner from a character stream or from a |
182 // string. | 180 // string. |
183 ASSERT(source.is_null() || stream == NULL); | 181 ASSERT(source.is_null() || stream == NULL); |
184 | 182 |
185 // Initialize the source buffer. | 183 // Initialize the source buffer. |
186 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { | 184 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { |
187 two_byte_string_buffer_.Initialize( | 185 two_byte_string_buffer_.Initialize( |
188 Handle<ExternalTwoByteString>::cast(source), | 186 Handle<ExternalTwoByteString>::cast(source), |
189 start_position, | 187 start_position, |
190 end_position); | 188 end_position); |
191 source_ = &two_byte_string_buffer_; | 189 return &two_byte_string_buffer_; |
192 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) { | 190 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) { |
193 ascii_string_buffer_.Initialize( | 191 ascii_string_buffer_.Initialize( |
194 Handle<ExternalAsciiString>::cast(source), | 192 Handle<ExternalAsciiString>::cast(source), |
195 start_position, | 193 start_position, |
196 end_position); | 194 end_position); |
197 source_ = &ascii_string_buffer_; | 195 return &ascii_string_buffer_; |
198 } else { | 196 } else { |
199 if (!source.is_null()) { | 197 if (!source.is_null()) { |
200 safe_string_input_buffer_.Reset(source.location()); | 198 safe_string_input_buffer_.Reset(source.location()); |
201 stream = &safe_string_input_buffer_; | 199 stream = &safe_string_input_buffer_; |
202 } | 200 } |
203 char_stream_buffer_.Initialize(source, | 201 char_stream_buffer_.Initialize(source, |
204 stream, | 202 stream, |
205 start_position, | 203 start_position, |
206 end_position); | 204 end_position); |
207 source_ = &char_stream_buffer_; | 205 return &char_stream_buffer_; |
208 } | 206 } |
| 207 } |
209 | 208 |
210 is_parsing_json_ = (language == JSON); | 209 // ---------------------------------------------------------------------------- |
| 210 // JsonScanner |
211 | 211 |
212 // Set c0_ (one character ahead) | 212 JsonScanner::JsonScanner() {} |
213 ASSERT(kCharacterLookaheadBufferSize == 1); | |
214 Advance(); | |
215 // Initialize current_ to not refer to a literal. | |
216 current_.literal_chars = Vector<const char>(); | |
217 // Reset literal buffer. | |
218 literal_buffer_.Reset(); | |
219 | 213 |
220 // Skip initial whitespace allowing HTML comment ends just like | 214 |
221 // after a newline and scan first token. | 215 void JsonScanner::Initialize(Handle<String> source) { |
222 has_line_terminator_before_next_ = true; | 216 source_ = stream_initializer_.Init(source, NULL, 0, source->length()); |
223 SkipWhiteSpace(); | 217 Init(); |
224 Scan(); | 218 // Skip initial whitespace. |
| 219 SkipJsonWhiteSpace(); |
| 220 // Preload first token as look-ahead. |
| 221 ScanJson(); |
225 } | 222 } |
226 | 223 |
227 | 224 |
228 Token::Value Scanner::Next() { | 225 Token::Value JsonScanner::Next() { |
229 // BUG 1215673: Find a thread safe way to set a stack limit in | 226 // BUG 1215673: Find a thread safe way to set a stack limit in |
230 // pre-parse mode. Otherwise, we cannot safely pre-parse from other | 227 // pre-parse mode. Otherwise, we cannot safely pre-parse from other |
231 // threads. | 228 // threads. |
232 current_ = next_; | 229 current_ = next_; |
233 // Check for stack-overflow before returning any tokens. | 230 // Check for stack-overflow before returning any tokens. |
234 StackLimitCheck check; | 231 StackLimitCheck check; |
235 if (check.HasOverflowed()) { | 232 if (check.HasOverflowed()) { |
236 stack_overflow_ = true; | 233 stack_overflow_ = true; |
237 next_.token = Token::ILLEGAL; | 234 next_.token = Token::ILLEGAL; |
238 } else { | 235 } else { |
239 has_line_terminator_before_next_ = false; | 236 ScanJson(); |
240 Scan(); | |
241 } | 237 } |
242 return current_.token; | 238 return current_.token; |
243 } | 239 } |
244 | 240 |
245 | 241 |
246 void Scanner::StartLiteral() { | 242 bool JsonScanner::SkipJsonWhiteSpace() { |
247 literal_buffer_.StartLiteral(); | |
248 } | |
249 | |
250 | |
251 void Scanner::AddLiteralChar(uc32 c) { | |
252 literal_buffer_.AddChar(c); | |
253 } | |
254 | |
255 | |
256 void Scanner::TerminateLiteral() { | |
257 next_.literal_chars = literal_buffer_.EndLiteral(); | |
258 } | |
259 | |
260 | |
261 void Scanner::DropLiteral() { | |
262 literal_buffer_.DropLiteral(); | |
263 } | |
264 | |
265 | |
266 void Scanner::AddLiteralCharAdvance() { | |
267 AddLiteralChar(c0_); | |
268 Advance(); | |
269 } | |
270 | |
271 | |
272 static inline bool IsByteOrderMark(uc32 c) { | |
273 // The Unicode value U+FFFE is guaranteed never to be assigned as a | |
274 // Unicode character; this implies that in a Unicode context the | |
275 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF | |
276 // character expressed in little-endian byte order (since it could | |
277 // not be a U+FFFE character expressed in big-endian byte | |
278 // order). Nevertheless, we check for it to be compatible with | |
279 // Spidermonkey. | |
280 return c == 0xFEFF || c == 0xFFFE; | |
281 } | |
282 | |
283 | |
284 bool Scanner::SkipJsonWhiteSpace() { | |
285 int start_position = source_pos(); | 243 int start_position = source_pos(); |
286 // JSON WhiteSpace is tab, carriage-return, newline and space. | 244 // JSON WhiteSpace is tab, carriage-return, newline and space. |
287 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') { | 245 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') { |
288 Advance(); | 246 Advance(); |
289 } | 247 } |
290 return source_pos() != start_position; | 248 return source_pos() != start_position; |
291 } | 249 } |
292 | 250 |
293 | 251 |
294 bool Scanner::SkipJavaScriptWhiteSpace() { | 252 void JsonScanner::ScanJson() { |
295 int start_position = source_pos(); | |
296 | |
297 while (true) { | |
298 // We treat byte-order marks (BOMs) as whitespace for better | |
299 // compatibility with Spidermonkey and other JavaScript engines. | |
300 while (ScannerConstants::kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { | |
301 // IsWhiteSpace() includes line terminators! | |
302 if (ScannerConstants::kIsLineTerminator.get(c0_)) { | |
303 // Ignore line terminators, but remember them. This is necessary | |
304 // for automatic semicolon insertion. | |
305 has_line_terminator_before_next_ = true; | |
306 } | |
307 Advance(); | |
308 } | |
309 | |
310 // If there is an HTML comment end '-->' at the beginning of a | |
311 // line (with only whitespace in front of it), we treat the rest | |
312 // of the line as a comment. This is in line with the way | |
313 // SpiderMonkey handles it. | |
314 if (c0_ == '-' && has_line_terminator_before_next_) { | |
315 Advance(); | |
316 if (c0_ == '-') { | |
317 Advance(); | |
318 if (c0_ == '>') { | |
319 // Treat the rest of the line as a comment. | |
320 SkipSingleLineComment(); | |
321 // Continue skipping white space after the comment. | |
322 continue; | |
323 } | |
324 PushBack('-'); // undo Advance() | |
325 } | |
326 PushBack('-'); // undo Advance() | |
327 } | |
328 // Return whether or not we skipped any characters. | |
329 return source_pos() != start_position; | |
330 } | |
331 } | |
332 | |
333 | |
334 Token::Value Scanner::SkipSingleLineComment() { | |
335 Advance(); | |
336 | |
337 // The line terminator at the end of the line is not considered | |
338 // to be part of the single-line comment; it is recognized | |
339 // separately by the lexical grammar and becomes part of the | |
340 // stream of input elements for the syntactic grammar (see | |
341 // ECMA-262, section 7.4, page 12). | |
342 while (c0_ >= 0 && !ScannerConstants::kIsLineTerminator.get(c0_)) { | |
343 Advance(); | |
344 } | |
345 | |
346 return Token::WHITESPACE; | |
347 } | |
348 | |
349 | |
350 Token::Value Scanner::SkipMultiLineComment() { | |
351 ASSERT(c0_ == '*'); | |
352 Advance(); | |
353 | |
354 while (c0_ >= 0) { | |
355 char ch = c0_; | |
356 Advance(); | |
357 // If we have reached the end of the multi-line comment, we | |
358 // consume the '/' and insert a whitespace. This way all | |
359 // multi-line comments are treated as whitespace - even the ones | |
360 // containing line terminators. This contradicts ECMA-262, section | |
361 // 7.4, page 12, that says that multi-line comments containing | |
362 // line terminators should be treated as a line terminator, but it | |
363 // matches the behaviour of SpiderMonkey and KJS. | |
364 if (ch == '*' && c0_ == '/') { | |
365 c0_ = ' '; | |
366 return Token::WHITESPACE; | |
367 } | |
368 } | |
369 | |
370 // Unterminated multi-line comment. | |
371 return Token::ILLEGAL; | |
372 } | |
373 | |
374 | |
375 Token::Value Scanner::ScanHtmlComment() { | |
376 // Check for <!-- comments. | |
377 ASSERT(c0_ == '!'); | |
378 Advance(); | |
379 if (c0_ == '-') { | |
380 Advance(); | |
381 if (c0_ == '-') return SkipSingleLineComment(); | |
382 PushBack('-'); // undo Advance() | |
383 } | |
384 PushBack('!'); // undo Advance() | |
385 ASSERT(c0_ == '!'); | |
386 return Token::LT; | |
387 } | |
388 | |
389 | |
390 | |
391 void Scanner::ScanJson() { | |
392 next_.literal_chars = Vector<const char>(); | 253 next_.literal_chars = Vector<const char>(); |
393 Token::Value token; | 254 Token::Value token; |
394 has_line_terminator_before_next_ = false; | |
395 do { | 255 do { |
396 // Remember the position of the next token | 256 // Remember the position of the next token |
397 next_.location.beg_pos = source_pos(); | 257 next_.location.beg_pos = source_pos(); |
398 switch (c0_) { | 258 switch (c0_) { |
399 case '\t': | 259 case '\t': |
400 case '\r': | 260 case '\r': |
401 case '\n': | 261 case '\n': |
402 case ' ': | 262 case ' ': |
403 Advance(); | 263 Advance(); |
404 token = Token::WHITESPACE; | 264 token = Token::WHITESPACE; |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
461 token = Select(Token::ILLEGAL); | 321 token = Select(Token::ILLEGAL); |
462 } | 322 } |
463 } | 323 } |
464 } while (token == Token::WHITESPACE); | 324 } while (token == Token::WHITESPACE); |
465 | 325 |
466 next_.location.end_pos = source_pos(); | 326 next_.location.end_pos = source_pos(); |
467 next_.token = token; | 327 next_.token = token; |
468 } | 328 } |
469 | 329 |
470 | 330 |
471 Token::Value Scanner::ScanJsonString() { | 331 Token::Value JsonScanner::ScanJsonString() { |
472 ASSERT_EQ('"', c0_); | 332 ASSERT_EQ('"', c0_); |
473 Advance(); | 333 Advance(); |
474 LiteralScope literal(this); | 334 LiteralScope literal(this); |
475 while (c0_ != '"' && c0_ > 0) { | 335 while (c0_ != '"' && c0_ > 0) { |
476 // Check for control character (0x00-0x1f) or unterminated string (<0). | 336 // Check for control character (0x00-0x1f) or unterminated string (<0). |
477 if (c0_ < 0x20) return Token::ILLEGAL; | 337 if (c0_ < 0x20) return Token::ILLEGAL; |
478 if (c0_ != '\\') { | 338 if (c0_ != '\\') { |
479 AddLiteralCharAdvance(); | 339 AddLiteralCharAdvance(); |
480 } else { | 340 } else { |
481 Advance(); | 341 Advance(); |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
521 } | 381 } |
522 if (c0_ != '"') { | 382 if (c0_ != '"') { |
523 return Token::ILLEGAL; | 383 return Token::ILLEGAL; |
524 } | 384 } |
525 literal.Complete(); | 385 literal.Complete(); |
526 Advance(); | 386 Advance(); |
527 return Token::STRING; | 387 return Token::STRING; |
528 } | 388 } |
529 | 389 |
530 | 390 |
531 Token::Value Scanner::ScanJsonNumber() { | 391 Token::Value JsonScanner::ScanJsonNumber() { |
532 LiteralScope literal(this); | 392 LiteralScope literal(this); |
533 if (c0_ == '-') AddLiteralCharAdvance(); | 393 if (c0_ == '-') AddLiteralCharAdvance(); |
534 if (c0_ == '0') { | 394 if (c0_ == '0') { |
535 AddLiteralCharAdvance(); | 395 AddLiteralCharAdvance(); |
536 // Prefix zero is only allowed if it's the only digit before | 396 // Prefix zero is only allowed if it's the only digit before |
537 // a decimal point or exponent. | 397 // a decimal point or exponent. |
538 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL; | 398 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL; |
539 } else { | 399 } else { |
540 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL; | 400 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL; |
541 do { | 401 do { |
(...skipping 13 matching lines...) Expand all Loading... |
555 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; | 415 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; |
556 do { | 416 do { |
557 AddLiteralCharAdvance(); | 417 AddLiteralCharAdvance(); |
558 } while (c0_ >= '0' && c0_ <= '9'); | 418 } while (c0_ >= '0' && c0_ <= '9'); |
559 } | 419 } |
560 literal.Complete(); | 420 literal.Complete(); |
561 return Token::NUMBER; | 421 return Token::NUMBER; |
562 } | 422 } |
563 | 423 |
564 | 424 |
565 Token::Value Scanner::ScanJsonIdentifier(const char* text, | 425 Token::Value JsonScanner::ScanJsonIdentifier(const char* text, |
566 Token::Value token) { | 426 Token::Value token) { |
567 LiteralScope literal(this); | 427 LiteralScope literal(this); |
568 while (*text != '\0') { | 428 while (*text != '\0') { |
569 if (c0_ != *text) return Token::ILLEGAL; | 429 if (c0_ != *text) return Token::ILLEGAL; |
570 Advance(); | 430 Advance(); |
571 text++; | 431 text++; |
572 } | 432 } |
573 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL; | 433 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL; |
574 literal.Complete(); | 434 literal.Complete(); |
575 return token; | 435 return token; |
576 } | 436 } |
577 | 437 |
578 | 438 |
579 void Scanner::ScanJavaScript() { | |
580 next_.literal_chars = Vector<const char>(); | |
581 Token::Value token; | |
582 do { | |
583 // Remember the position of the next token | |
584 next_.location.beg_pos = source_pos(); | |
585 | |
586 switch (c0_) { | |
587 case ' ': | |
588 case '\t': | |
589 Advance(); | |
590 token = Token::WHITESPACE; | |
591 break; | |
592 | |
593 case '\n': | |
594 Advance(); | |
595 has_line_terminator_before_next_ = true; | |
596 token = Token::WHITESPACE; | |
597 break; | |
598 | |
599 case '"': case '\'': | |
600 token = ScanString(); | |
601 break; | |
602 | |
603 case '<': | |
604 // < <= << <<= <!-- | |
605 Advance(); | |
606 if (c0_ == '=') { | |
607 token = Select(Token::LTE); | |
608 } else if (c0_ == '<') { | |
609 token = Select('=', Token::ASSIGN_SHL, Token::SHL); | |
610 } else if (c0_ == '!') { | |
611 token = ScanHtmlComment(); | |
612 } else { | |
613 token = Token::LT; | |
614 } | |
615 break; | |
616 | |
617 case '>': | |
618 // > >= >> >>= >>> >>>= | |
619 Advance(); | |
620 if (c0_ == '=') { | |
621 token = Select(Token::GTE); | |
622 } else if (c0_ == '>') { | |
623 // >> >>= >>> >>>= | |
624 Advance(); | |
625 if (c0_ == '=') { | |
626 token = Select(Token::ASSIGN_SAR); | |
627 } else if (c0_ == '>') { | |
628 token = Select('=', Token::ASSIGN_SHR, Token::SHR); | |
629 } else { | |
630 token = Token::SAR; | |
631 } | |
632 } else { | |
633 token = Token::GT; | |
634 } | |
635 break; | |
636 | |
637 case '=': | |
638 // = == === | |
639 Advance(); | |
640 if (c0_ == '=') { | |
641 token = Select('=', Token::EQ_STRICT, Token::EQ); | |
642 } else { | |
643 token = Token::ASSIGN; | |
644 } | |
645 break; | |
646 | |
647 case '!': | |
648 // ! != !== | |
649 Advance(); | |
650 if (c0_ == '=') { | |
651 token = Select('=', Token::NE_STRICT, Token::NE); | |
652 } else { | |
653 token = Token::NOT; | |
654 } | |
655 break; | |
656 | |
657 case '+': | |
658 // + ++ += | |
659 Advance(); | |
660 if (c0_ == '+') { | |
661 token = Select(Token::INC); | |
662 } else if (c0_ == '=') { | |
663 token = Select(Token::ASSIGN_ADD); | |
664 } else { | |
665 token = Token::ADD; | |
666 } | |
667 break; | |
668 | |
669 case '-': | |
670 // - -- --> -= | |
671 Advance(); | |
672 if (c0_ == '-') { | |
673 Advance(); | |
674 if (c0_ == '>' && has_line_terminator_before_next_) { | |
675 // For compatibility with SpiderMonkey, we skip lines that | |
676 // start with an HTML comment end '-->'. | |
677 token = SkipSingleLineComment(); | |
678 } else { | |
679 token = Token::DEC; | |
680 } | |
681 } else if (c0_ == '=') { | |
682 token = Select(Token::ASSIGN_SUB); | |
683 } else { | |
684 token = Token::SUB; | |
685 } | |
686 break; | |
687 | |
688 case '*': | |
689 // * *= | |
690 token = Select('=', Token::ASSIGN_MUL, Token::MUL); | |
691 break; | |
692 | |
693 case '%': | |
694 // % %= | |
695 token = Select('=', Token::ASSIGN_MOD, Token::MOD); | |
696 break; | |
697 | |
698 case '/': | |
699 // / // /* /= | |
700 Advance(); | |
701 if (c0_ == '/') { | |
702 token = SkipSingleLineComment(); | |
703 } else if (c0_ == '*') { | |
704 token = SkipMultiLineComment(); | |
705 } else if (c0_ == '=') { | |
706 token = Select(Token::ASSIGN_DIV); | |
707 } else { | |
708 token = Token::DIV; | |
709 } | |
710 break; | |
711 | |
712 case '&': | |
713 // & && &= | |
714 Advance(); | |
715 if (c0_ == '&') { | |
716 token = Select(Token::AND); | |
717 } else if (c0_ == '=') { | |
718 token = Select(Token::ASSIGN_BIT_AND); | |
719 } else { | |
720 token = Token::BIT_AND; | |
721 } | |
722 break; | |
723 | |
724 case '|': | |
725 // | || |= | |
726 Advance(); | |
727 if (c0_ == '|') { | |
728 token = Select(Token::OR); | |
729 } else if (c0_ == '=') { | |
730 token = Select(Token::ASSIGN_BIT_OR); | |
731 } else { | |
732 token = Token::BIT_OR; | |
733 } | |
734 break; | |
735 | |
736 case '^': | |
737 // ^ ^= | |
738 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); | |
739 break; | |
740 | |
741 case '.': | |
742 // . Number | |
743 Advance(); | |
744 if (IsDecimalDigit(c0_)) { | |
745 token = ScanNumber(true); | |
746 } else { | |
747 token = Token::PERIOD; | |
748 } | |
749 break; | |
750 | |
751 case ':': | |
752 token = Select(Token::COLON); | |
753 break; | |
754 | |
755 case ';': | |
756 token = Select(Token::SEMICOLON); | |
757 break; | |
758 | |
759 case ',': | |
760 token = Select(Token::COMMA); | |
761 break; | |
762 | |
763 case '(': | |
764 token = Select(Token::LPAREN); | |
765 break; | |
766 | |
767 case ')': | |
768 token = Select(Token::RPAREN); | |
769 break; | |
770 | |
771 case '[': | |
772 token = Select(Token::LBRACK); | |
773 break; | |
774 | |
775 case ']': | |
776 token = Select(Token::RBRACK); | |
777 break; | |
778 | |
779 case '{': | |
780 token = Select(Token::LBRACE); | |
781 break; | |
782 | |
783 case '}': | |
784 token = Select(Token::RBRACE); | |
785 break; | |
786 | |
787 case '?': | |
788 token = Select(Token::CONDITIONAL); | |
789 break; | |
790 | |
791 case '~': | |
792 token = Select(Token::BIT_NOT); | |
793 break; | |
794 | |
795 default: | |
796 if (ScannerConstants::kIsIdentifierStart.get(c0_)) { | |
797 token = ScanIdentifier(); | |
798 } else if (IsDecimalDigit(c0_)) { | |
799 token = ScanNumber(false); | |
800 } else if (SkipWhiteSpace()) { | |
801 token = Token::WHITESPACE; | |
802 } else if (c0_ < 0) { | |
803 token = Token::EOS; | |
804 } else { | |
805 token = Select(Token::ILLEGAL); | |
806 } | |
807 break; | |
808 } | |
809 | |
810 // Continue scanning for tokens as long as we're just skipping | |
811 // whitespace. | |
812 } while (token == Token::WHITESPACE); | |
813 | |
814 next_.location.end_pos = source_pos(); | |
815 next_.token = token; | |
816 } | |
817 | |
818 | |
819 void Scanner::SeekForward(int pos) { | |
820 source_->SeekForward(pos - 1); | |
821 Advance(); | |
822 // This function is only called to seek to the location | |
823 // of the end of a function (at the "}" token). It doesn't matter | |
824 // whether there was a line terminator in the part we skip. | |
825 has_line_terminator_before_next_ = false; | |
826 Scan(); | |
827 } | |
828 | |
829 | |
830 uc32 Scanner::ScanHexEscape(uc32 c, int length) { | |
831 ASSERT(length <= 4); // prevent overflow | |
832 | |
833 uc32 digits[4]; | |
834 uc32 x = 0; | |
835 for (int i = 0; i < length; i++) { | |
836 digits[i] = c0_; | |
837 int d = HexValue(c0_); | |
838 if (d < 0) { | |
839 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes | |
840 // should be illegal, but other JS VMs just return the | |
841 // non-escaped version of the original character. | |
842 | |
843 // Push back digits read, except the last one (in c0_). | |
844 for (int j = i-1; j >= 0; j--) { | |
845 PushBack(digits[j]); | |
846 } | |
847 // Notice: No handling of error - treat it as "\u"->"u". | |
848 return c; | |
849 } | |
850 x = x * 16 + d; | |
851 Advance(); | |
852 } | |
853 | |
854 return x; | |
855 } | |
856 | |
857 | |
858 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of | |
859 // ECMA-262. Other JS VMs support them. | |
860 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { | |
861 uc32 x = c - '0'; | |
862 for (int i = 0; i < length; i++) { | |
863 int d = c0_ - '0'; | |
864 if (d < 0 || d > 7) break; | |
865 int nx = x * 8 + d; | |
866 if (nx >= 256) break; | |
867 x = nx; | |
868 Advance(); | |
869 } | |
870 return x; | |
871 } | |
872 | |
873 | |
874 void Scanner::ScanEscape() { | |
875 uc32 c = c0_; | |
876 Advance(); | |
877 | |
878 // Skip escaped newlines. | |
879 if (ScannerConstants::kIsLineTerminator.get(c)) { | |
880 // Allow CR+LF newlines in multiline string literals. | |
881 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); | |
882 // Allow LF+CR newlines in multiline string literals. | |
883 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); | |
884 return; | |
885 } | |
886 | |
887 switch (c) { | |
888 case '\'': // fall through | |
889 case '"' : // fall through | |
890 case '\\': break; | |
891 case 'b' : c = '\b'; break; | |
892 case 'f' : c = '\f'; break; | |
893 case 'n' : c = '\n'; break; | |
894 case 'r' : c = '\r'; break; | |
895 case 't' : c = '\t'; break; | |
896 case 'u' : c = ScanHexEscape(c, 4); break; | |
897 case 'v' : c = '\v'; break; | |
898 case 'x' : c = ScanHexEscape(c, 2); break; | |
899 case '0' : // fall through | |
900 case '1' : // fall through | |
901 case '2' : // fall through | |
902 case '3' : // fall through | |
903 case '4' : // fall through | |
904 case '5' : // fall through | |
905 case '6' : // fall through | |
906 case '7' : c = ScanOctalEscape(c, 2); break; | |
907 } | |
908 | |
909 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these | |
910 // should be illegal, but they are commonly handled | |
911 // as non-escaped characters by JS VMs. | |
912 AddLiteralChar(c); | |
913 } | |
914 | |
915 | |
916 Token::Value Scanner::ScanString() { | |
917 uc32 quote = c0_; | |
918 Advance(); // consume quote | |
919 | |
920 LiteralScope literal(this); | |
921 while (c0_ != quote && c0_ >= 0 | |
922 && !ScannerConstants::kIsLineTerminator.get(c0_)) { | |
923 uc32 c = c0_; | |
924 Advance(); | |
925 if (c == '\\') { | |
926 if (c0_ < 0) return Token::ILLEGAL; | |
927 ScanEscape(); | |
928 } else { | |
929 AddLiteralChar(c); | |
930 } | |
931 } | |
932 if (c0_ != quote) return Token::ILLEGAL; | |
933 literal.Complete(); | |
934 | |
935 Advance(); // consume quote | |
936 return Token::STRING; | |
937 } | |
938 | |
939 | |
940 Token::Value Scanner::Select(Token::Value tok) { | |
941 Advance(); | |
942 return tok; | |
943 } | |
944 | |
945 | |
946 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) { | |
947 Advance(); | |
948 if (c0_ == next) { | |
949 Advance(); | |
950 return then; | |
951 } else { | |
952 return else_; | |
953 } | |
954 } | |
955 | |
956 | |
957 // Returns true if any decimal digits were scanned, returns false otherwise. | |
958 void Scanner::ScanDecimalDigits() { | |
959 while (IsDecimalDigit(c0_)) | |
960 AddLiteralCharAdvance(); | |
961 } | |
962 | |
963 | |
964 Token::Value Scanner::ScanNumber(bool seen_period) { | |
965 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction | |
966 | |
967 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; | |
968 | |
969 LiteralScope literal(this); | |
970 if (seen_period) { | |
971 // we have already seen a decimal point of the float | |
972 AddLiteralChar('.'); | |
973 ScanDecimalDigits(); // we know we have at least one digit | |
974 | |
975 } else { | |
976 // if the first character is '0' we must check for octals and hex | |
977 if (c0_ == '0') { | |
978 AddLiteralCharAdvance(); | |
979 | |
980 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number | |
981 if (c0_ == 'x' || c0_ == 'X') { | |
982 // hex number | |
983 kind = HEX; | |
984 AddLiteralCharAdvance(); | |
985 if (!IsHexDigit(c0_)) { | |
986 // we must have at least one hex digit after 'x'/'X' | |
987 return Token::ILLEGAL; | |
988 } | |
989 while (IsHexDigit(c0_)) { | |
990 AddLiteralCharAdvance(); | |
991 } | |
992 } else if ('0' <= c0_ && c0_ <= '7') { | |
993 // (possible) octal number | |
994 kind = OCTAL; | |
995 while (true) { | |
996 if (c0_ == '8' || c0_ == '9') { | |
997 kind = DECIMAL; | |
998 break; | |
999 } | |
1000 if (c0_ < '0' || '7' < c0_) break; | |
1001 AddLiteralCharAdvance(); | |
1002 } | |
1003 } | |
1004 } | |
1005 | |
1006 // Parse decimal digits and allow trailing fractional part. | |
1007 if (kind == DECIMAL) { | |
1008 ScanDecimalDigits(); // optional | |
1009 if (c0_ == '.') { | |
1010 AddLiteralCharAdvance(); | |
1011 ScanDecimalDigits(); // optional | |
1012 } | |
1013 } | |
1014 } | |
1015 | |
1016 // scan exponent, if any | |
1017 if (c0_ == 'e' || c0_ == 'E') { | |
1018 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number | |
1019 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed | |
1020 // scan exponent | |
1021 AddLiteralCharAdvance(); | |
1022 if (c0_ == '+' || c0_ == '-') | |
1023 AddLiteralCharAdvance(); | |
1024 if (!IsDecimalDigit(c0_)) { | |
1025 // we must have at least one decimal digit after 'e'/'E' | |
1026 return Token::ILLEGAL; | |
1027 } | |
1028 ScanDecimalDigits(); | |
1029 } | |
1030 | |
1031 // The source character immediately following a numeric literal must | |
1032 // not be an identifier start or a decimal digit; see ECMA-262 | |
1033 // section 7.8.3, page 17 (note that we read only one decimal digit | |
1034 // if the value is 0). | |
1035 if (IsDecimalDigit(c0_) || ScannerConstants::kIsIdentifierStart.get(c0_)) | |
1036 return Token::ILLEGAL; | |
1037 | |
1038 literal.Complete(); | |
1039 | |
1040 return Token::NUMBER; | |
1041 } | |
1042 | |
1043 | |
1044 uc32 Scanner::ScanIdentifierUnicodeEscape() { | |
1045 Advance(); | |
1046 if (c0_ != 'u') return unibrow::Utf8::kBadChar; | |
1047 Advance(); | |
1048 uc32 c = ScanHexEscape('u', 4); | |
1049 // We do not allow a unicode escape sequence to start another | |
1050 // unicode escape sequence. | |
1051 if (c == '\\') return unibrow::Utf8::kBadChar; | |
1052 return c; | |
1053 } | |
1054 | |
1055 | |
1056 Token::Value Scanner::ScanIdentifier() { | |
1057 ASSERT(ScannerConstants::kIsIdentifierStart.get(c0_)); | |
1058 | |
1059 LiteralScope literal(this); | |
1060 KeywordMatcher keyword_match; | |
1061 | |
1062 // Scan identifier start character. | |
1063 if (c0_ == '\\') { | |
1064 uc32 c = ScanIdentifierUnicodeEscape(); | |
1065 // Only allow legal identifier start characters. | |
1066 if (!ScannerConstants::kIsIdentifierStart.get(c)) return Token::ILLEGAL; | |
1067 AddLiteralChar(c); | |
1068 keyword_match.Fail(); | |
1069 } else { | |
1070 AddLiteralChar(c0_); | |
1071 keyword_match.AddChar(c0_); | |
1072 Advance(); | |
1073 } | |
1074 | |
1075 // Scan the rest of the identifier characters. | |
1076 while (ScannerConstants::kIsIdentifierPart.get(c0_)) { | |
1077 if (c0_ == '\\') { | |
1078 uc32 c = ScanIdentifierUnicodeEscape(); | |
1079 // Only allow legal identifier part characters. | |
1080 if (!ScannerConstants::kIsIdentifierPart.get(c)) return Token::ILLEGAL; | |
1081 AddLiteralChar(c); | |
1082 keyword_match.Fail(); | |
1083 } else { | |
1084 AddLiteralChar(c0_); | |
1085 keyword_match.AddChar(c0_); | |
1086 Advance(); | |
1087 } | |
1088 } | |
1089 literal.Complete(); | |
1090 | |
1091 return keyword_match.token(); | |
1092 } | |
1093 | |
1094 | |
1095 | 439 |
1096 bool Scanner::ScanRegExpPattern(bool seen_equal) { | |
1097 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags | |
1098 bool in_character_class = false; | |
1099 | |
1100 // Previous token is either '/' or '/=', in the second case, the | |
1101 // pattern starts at =. | |
1102 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); | |
1103 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); | |
1104 | |
1105 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, | |
1106 // the scanner should pass uninterpreted bodies to the RegExp | |
1107 // constructor. | |
1108 LiteralScope literal(this); | |
1109 if (seen_equal) | |
1110 AddLiteralChar('='); | |
1111 | |
1112 while (c0_ != '/' || in_character_class) { | |
1113 if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false; | |
1114 if (c0_ == '\\') { // escaped character | |
1115 AddLiteralCharAdvance(); | |
1116 if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false; | |
1117 AddLiteralCharAdvance(); | |
1118 } else { // unescaped character | |
1119 if (c0_ == '[') in_character_class = true; | |
1120 if (c0_ == ']') in_character_class = false; | |
1121 AddLiteralCharAdvance(); | |
1122 } | |
1123 } | |
1124 Advance(); // consume '/' | |
1125 | |
1126 literal.Complete(); | |
1127 | |
1128 return true; | |
1129 } | |
1130 | |
1131 bool Scanner::ScanRegExpFlags() { | |
1132 // Scan regular expression flags. | |
1133 LiteralScope literal(this); | |
1134 while (ScannerConstants::kIsIdentifierPart.get(c0_)) { | |
1135 if (c0_ == '\\') { | |
1136 uc32 c = ScanIdentifierUnicodeEscape(); | |
1137 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { | |
1138 // We allow any escaped character, unlike the restriction on | |
1139 // IdentifierPart when it is used to build an IdentifierName. | |
1140 AddLiteralChar(c); | |
1141 continue; | |
1142 } | |
1143 } | |
1144 AddLiteralCharAdvance(); | |
1145 } | |
1146 literal.Complete(); | |
1147 | |
1148 next_.location.end_pos = source_pos() - 1; | |
1149 return true; | |
1150 } | |
1151 | |
1152 } } // namespace v8::internal | 440 } } // namespace v8::internal |
OLD | NEW |