OLD | NEW |
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
147 // Incrementally recognize keywords. | 147 // Incrementally recognize keywords. |
148 // | 148 // |
149 // Recognized keywords: | 149 // Recognized keywords: |
150 // break case catch const* continue debugger* default delete do else | 150 // break case catch const* continue debugger* default delete do else |
151 // finally false for function if in instanceof native* new null | 151 // finally false for function if in instanceof native* new null |
152 // return switch this throw true try typeof var void while with | 152 // return switch this throw true try typeof var void while with |
153 // | 153 // |
154 // *: Actually "future reserved keywords". These are the only ones we | 154 // *: Actually "future reserved keywords". These are the only ones we |
155 // recognize; the remaining are allowed as identifiers. | 155 // recognize; the remaining are allowed as identifiers. |
156 public: | 156 public: |
157 KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {} | 157 KeywordMatcher() |
| 158 : state_(INITIAL), |
| 159 token_(Token::IDENTIFIER), |
| 160 keyword_(NULL), |
| 161 counter_(0), |
| 162 keyword_token_(Token::ILLEGAL) {} |
158 | 163 |
159 Token::Value token() { return token_; } | 164 Token::Value token() { return token_; } |
160 | 165 |
161 inline void AddChar(uc32 input) { | 166 inline void AddChar(uc32 input) { |
162 if (state_ != UNMATCHABLE) { | 167 if (state_ != UNMATCHABLE) { |
163 Step(input); | 168 Step(input); |
164 } | 169 } |
165 } | 170 } |
166 | 171 |
167 void Fail() { | 172 void Fail() { |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
199 }; | 204 }; |
200 | 205 |
201 // Range of possible first characters of a keyword. | 206 // Range of possible first characters of a keyword. |
202 static const unsigned int kFirstCharRangeMin = 'b'; | 207 static const unsigned int kFirstCharRangeMin = 'b'; |
203 static const unsigned int kFirstCharRangeMax = 'w'; | 208 static const unsigned int kFirstCharRangeMax = 'w'; |
204 static const unsigned int kFirstCharRangeLength = | 209 static const unsigned int kFirstCharRangeLength = |
205 kFirstCharRangeMax - kFirstCharRangeMin + 1; | 210 kFirstCharRangeMax - kFirstCharRangeMin + 1; |
206 // State map for first keyword character range. | 211 // State map for first keyword character range. |
207 static FirstState first_states_[kFirstCharRangeLength]; | 212 static FirstState first_states_[kFirstCharRangeLength]; |
208 | 213 |
209 // Current state. | |
210 State state_; | |
211 // Token for currently added characters. | |
212 Token::Value token_; | |
213 | |
214 // Matching a specific keyword string (there is only one possible valid | |
215 // keyword with the current prefix). | |
216 const char* keyword_; | |
217 int counter_; | |
218 Token::Value keyword_token_; | |
219 | |
220 // If input equals keyword's character at position, continue matching keyword | 214 // If input equals keyword's character at position, continue matching keyword |
221 // from that position. | 215 // from that position. |
222 inline bool MatchKeywordStart(uc32 input, | 216 inline bool MatchKeywordStart(uc32 input, |
223 const char* keyword, | 217 const char* keyword, |
224 int position, | 218 int position, |
225 Token::Value token_if_match) { | 219 Token::Value token_if_match) { |
226 if (input == keyword[position]) { | 220 if (input == keyword[position]) { |
227 state_ = KEYWORD_PREFIX; | 221 state_ = KEYWORD_PREFIX; |
228 this->keyword_ = keyword; | 222 this->keyword_ = keyword; |
229 this->counter_ = position + 1; | 223 this->counter_ = position + 1; |
230 this->keyword_token_ = token_if_match; | 224 this->keyword_token_ = token_if_match; |
231 return true; | 225 return true; |
232 } | 226 } |
233 return false; | 227 return false; |
234 } | 228 } |
235 | 229 |
236 // If input equals match character, transition to new state and return true. | 230 // If input equals match character, transition to new state and return true. |
237 inline bool MatchState(uc32 input, char match, State new_state) { | 231 inline bool MatchState(uc32 input, char match, State new_state) { |
238 if (input == match) { | 232 if (input == match) { |
239 state_ = new_state; | 233 state_ = new_state; |
240 return true; | 234 return true; |
241 } | 235 } |
242 return false; | 236 return false; |
243 } | 237 } |
244 | 238 |
245 inline bool MatchKeyword(uc32 input, | 239 inline bool MatchKeyword(uc32 input, |
246 char match, | 240 char match, |
247 State new_state, | 241 State new_state, |
248 Token::Value keyword_token) { | 242 Token::Value keyword_token) { |
249 if (input == match) { // Matched "do". | 243 if (input != match) { |
250 state_ = new_state; | 244 return false; |
251 token_ = keyword_token; | |
252 return true; | |
253 } | 245 } |
254 return false; | 246 state_ = new_state; |
| 247 token_ = keyword_token; |
| 248 return true; |
255 } | 249 } |
256 | 250 |
257 void Step(uc32 input); | 251 void Step(uc32 input); |
| 252 |
| 253 // Current state. |
| 254 State state_; |
| 255 // Token for currently added characters. |
| 256 Token::Value token_; |
| 257 |
| 258 // Matching a specific keyword string (there is only one possible valid |
| 259 // keyword with the current prefix). |
| 260 const char* keyword_; |
| 261 int counter_; |
| 262 Token::Value keyword_token_; |
258 }; | 263 }; |
259 | 264 |
260 | 265 |
261 enum ParserMode { PARSE, PREPARSE }; | 266 enum ParserMode { PARSE, PREPARSE }; |
262 enum ParserLanguage { JAVASCRIPT, JSON }; | 267 enum ParserLanguage { JAVASCRIPT, JSON }; |
263 | 268 |
264 | 269 |
265 class Scanner { | 270 class Scanner { |
266 public: | 271 public: |
267 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 272 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
355 | 360 |
356 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; | 361 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; |
357 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; | 362 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; |
358 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | 363 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
359 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | 364 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
360 | 365 |
361 static const int kCharacterLookaheadBufferSize = 1; | 366 static const int kCharacterLookaheadBufferSize = 1; |
362 static const int kNoEndPosition = 1; | 367 static const int kNoEndPosition = 1; |
363 | 368 |
364 private: | 369 private: |
365 void Init(Handle<String> source, | |
366 unibrow::CharacterStream* stream, | |
367 int start_position, int end_position, | |
368 ParserLanguage language); | |
369 | |
370 | |
371 // Different UTF16 buffers used to pull characters from. Based on input one of | |
372 // these will be initialized as the actual data source. | |
373 CharacterStreamUTF16Buffer char_stream_buffer_; | |
374 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> | |
375 two_byte_string_buffer_; | |
376 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; | |
377 | |
378 // Source. Will point to one of the buffers declared above. | |
379 UTF16Buffer* source_; | |
380 | |
381 // Used to convert the source string into a character stream when a stream | |
382 // is not passed to the scanner. | |
383 SafeStringInputBuffer safe_string_input_buffer_; | |
384 | |
385 // Buffer to hold literal values (identifiers, strings, numbers) | |
386 // using 0-terminated UTF-8 encoding. | |
387 UTF8Buffer literal_buffer_1_; | |
388 UTF8Buffer literal_buffer_2_; | |
389 | |
390 bool stack_overflow_; | |
391 static StaticResource<Utf8Decoder> utf8_decoder_; | |
392 | |
393 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | |
394 uc32 c0_; | |
395 | |
396 // The current and look-ahead token. | 370 // The current and look-ahead token. |
397 struct TokenDesc { | 371 struct TokenDesc { |
398 Token::Value token; | 372 Token::Value token; |
399 Location location; | 373 Location location; |
400 UTF8Buffer* literal_buffer; | 374 UTF8Buffer* literal_buffer; |
401 }; | 375 }; |
402 | 376 |
403 TokenDesc current_; // desc for current token (as returned by Next()) | 377 void Init(Handle<String> source, |
404 TokenDesc next_; // desc for next token (one token look-ahead) | 378 unibrow::CharacterStream* stream, |
405 bool has_line_terminator_before_next_; | 379 int start_position, int end_position, |
406 bool is_pre_parsing_; | 380 ParserLanguage language); |
407 bool is_parsing_json_; | |
408 | 381 |
409 // Literal buffer support | 382 // Literal buffer support |
410 void StartLiteral(); | 383 void StartLiteral(); |
411 void AddChar(uc32 ch); | 384 void AddChar(uc32 ch); |
412 void AddCharAdvance(); | 385 void AddCharAdvance(); |
413 void TerminateLiteral(); | 386 void TerminateLiteral(); |
414 | 387 |
415 // Low-level scanning support. | 388 // Low-level scanning support. |
416 void Advance() { c0_ = source_->Advance(); } | 389 void Advance() { c0_ = source_->Advance(); } |
417 void PushBack(uc32 ch) { | 390 void PushBack(uc32 ch) { |
418 source_->PushBack(ch); | 391 source_->PushBack(ch); |
419 c0_ = ch; | 392 c0_ = ch; |
420 } | 393 } |
421 | 394 |
422 bool SkipWhiteSpace() { | 395 bool SkipWhiteSpace() { |
423 if (is_parsing_json_) { | 396 if (is_parsing_json_) { |
424 return SkipJsonWhiteSpace(); | 397 return SkipJsonWhiteSpace(); |
425 } else { | 398 } else { |
426 return SkipJavaScriptWhiteSpace(); | 399 return SkipJavaScriptWhiteSpace(); |
427 } | 400 } |
428 } | 401 } |
| 402 |
429 bool SkipJavaScriptWhiteSpace(); | 403 bool SkipJavaScriptWhiteSpace(); |
430 bool SkipJsonWhiteSpace(); | 404 bool SkipJsonWhiteSpace(); |
431 Token::Value SkipSingleLineComment(); | 405 Token::Value SkipSingleLineComment(); |
432 Token::Value SkipMultiLineComment(); | 406 Token::Value SkipMultiLineComment(); |
433 | 407 |
434 inline Token::Value Select(Token::Value tok); | 408 inline Token::Value Select(Token::Value tok); |
435 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); | 409 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); |
436 | 410 |
437 inline void Scan() { | 411 inline void Scan() { |
438 if (is_parsing_json_) { | 412 if (is_parsing_json_) { |
(...skipping 14 matching lines...) Expand all Loading... |
453 // carriage-return, newline and space. | 427 // carriage-return, newline and space. |
454 void ScanJson(); | 428 void ScanJson(); |
455 | 429 |
456 // A JSON number (production JSONNumber) is a subset of the valid JavaScript | 430 // A JSON number (production JSONNumber) is a subset of the valid JavaScript |
457 // decimal number literals. | 431 // decimal number literals. |
458 // It includes an optional minus sign, must have at least one | 432 // It includes an optional minus sign, must have at least one |
459 // digit before and after a decimal point, may not have prefixed zeros (unless | 433 // digit before and after a decimal point, may not have prefixed zeros (unless |
460 // the integer part is zero), and may include an exponent part (e.g., "e-10"). | 434 // the integer part is zero), and may include an exponent part (e.g., "e-10"). |
461 // Hexadecimal and octal numbers are not allowed. | 435 // Hexadecimal and octal numbers are not allowed. |
462 Token::Value ScanJsonNumber(); | 436 Token::Value ScanJsonNumber(); |
| 437 |
463 // A JSON string (production JSONString) is a subset of valid JavaScript | 438 // A JSON string (production JSONString) is a subset of valid JavaScript |
464 // string literals. The string must only be double-quoted (not single-quoted), | 439 // string literals. The string must only be double-quoted (not single-quoted), |
465 // and the only allowed backslash-escapes are ", /, \, b, f, n, r, t and | 440 // and the only allowed backslash-escapes are ", /, \, b, f, n, r, t and |
466 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. | 441 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. |
467 Token::Value ScanJsonString(); | 442 Token::Value ScanJsonString(); |
| 443 |
468 // Used to recognize one of the literals "true", "false", or "null". These | 444 // Used to recognize one of the literals "true", "false", or "null". These |
469 // are the only valid JSON identifiers (productions JSONBooleanLiteral, | 445 // are the only valid JSON identifiers (productions JSONBooleanLiteral, |
470 // JSONNullLiteral). | 446 // JSONNullLiteral). |
471 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); | 447 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); |
472 | 448 |
473 void ScanDecimalDigits(); | 449 void ScanDecimalDigits(); |
474 Token::Value ScanNumber(bool seen_period); | 450 Token::Value ScanNumber(bool seen_period); |
475 Token::Value ScanIdentifier(); | 451 Token::Value ScanIdentifier(); |
476 uc32 ScanHexEscape(uc32 c, int length); | 452 uc32 ScanHexEscape(uc32 c, int length); |
477 uc32 ScanOctalEscape(uc32 c, int length); | 453 uc32 ScanOctalEscape(uc32 c, int length); |
478 void ScanEscape(); | 454 void ScanEscape(); |
479 Token::Value ScanString(); | 455 Token::Value ScanString(); |
480 | 456 |
481 // Scans a possible HTML comment -- begins with '<!'. | 457 // Scans a possible HTML comment -- begins with '<!'. |
482 Token::Value ScanHtmlComment(); | 458 Token::Value ScanHtmlComment(); |
483 | 459 |
484 // Return the current source position. | 460 // Return the current source position. |
485 int source_pos() { | 461 int source_pos() { |
486 return source_->pos() - kCharacterLookaheadBufferSize; | 462 return source_->pos() - kCharacterLookaheadBufferSize; |
487 } | 463 } |
488 | 464 |
489 // Decodes a unicode escape-sequence which is part of an identifier. | 465 // Decodes a unicode escape-sequence which is part of an identifier. |
490 // If the escape sequence cannot be decoded the result is kBadRune. | 466 // If the escape sequence cannot be decoded the result is kBadRune. |
491 uc32 ScanIdentifierUnicodeEscape(); | 467 uc32 ScanIdentifierUnicodeEscape(); |
| 468 |
| 469 TokenDesc current_; // desc for current token (as returned by Next()) |
| 470 TokenDesc next_; // desc for next token (one token look-ahead) |
| 471 bool has_line_terminator_before_next_; |
| 472 bool is_pre_parsing_; |
| 473 bool is_parsing_json_; |
| 474 |
| 475 // Different UTF16 buffers used to pull characters from. Based on input one of |
| 476 // these will be initialized as the actual data source. |
| 477 CharacterStreamUTF16Buffer char_stream_buffer_; |
| 478 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> |
| 479 two_byte_string_buffer_; |
| 480 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; |
| 481 |
| 482 // Source. Will point to one of the buffers declared above. |
| 483 UTF16Buffer* source_; |
| 484 |
| 485 // Used to convert the source string into a character stream when a stream |
| 486 // is not passed to the scanner. |
| 487 SafeStringInputBuffer safe_string_input_buffer_; |
| 488 |
| 489 // Buffer to hold literal values (identifiers, strings, numbers) |
| 490 // using 0-terminated UTF-8 encoding. |
| 491 UTF8Buffer literal_buffer_1_; |
| 492 UTF8Buffer literal_buffer_2_; |
| 493 |
| 494 bool stack_overflow_; |
| 495 static StaticResource<Utf8Decoder> utf8_decoder_; |
| 496 |
| 497 // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 498 uc32 c0_; |
492 }; | 499 }; |
493 | 500 |
494 } } // namespace v8::internal | 501 } } // namespace v8::internal |
495 | 502 |
496 #endif // V8_SCANNER_H_ | 503 #endif // V8_SCANNER_H_ |
OLD | NEW |