OLD | NEW |
1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 19 matching lines...) Expand all Loading... |
30 #ifndef V8_SCANNER_BASE_H_ | 30 #ifndef V8_SCANNER_BASE_H_ |
31 #define V8_SCANNER_BASE_H_ | 31 #define V8_SCANNER_BASE_H_ |
32 | 32 |
33 #include "globals.h" | 33 #include "globals.h" |
34 #include "checks.h" | 34 #include "checks.h" |
35 #include "allocation.h" | 35 #include "allocation.h" |
36 #include "token.h" | 36 #include "token.h" |
37 #include "unicode-inl.h" | 37 #include "unicode-inl.h" |
38 #include "char-predicates.h" | 38 #include "char-predicates.h" |
39 #include "utils.h" | 39 #include "utils.h" |
| 40 #include "list-inl.h" |
40 | 41 |
41 namespace v8 { | 42 namespace v8 { |
42 namespace internal { | 43 namespace internal { |
43 | 44 |
44 // Interface through which the scanner reads characters from the input source. | 45 // Returns the value (0 .. 15) of a hexadecimal character c. |
| 46 // If c is not a legal hexadecimal character, returns a value < 0. |
| 47 inline int HexValue(uc32 c) { |
| 48 c -= '0'; |
| 49 if (static_cast<unsigned>(c) <= 9) return c; |
| 50 c = (c | 0x20) - ('a' - '0'); // fold 'A'..'F' (0x11..0x16) and 'a'..'f' (0x31..0x36) to 0..5. |
| 51 if (static_cast<unsigned>(c) <= 5) return c + 10; |
| 52 return -1; |
| 53 } |
| 54 |
| 55 // ---------------------------------------------------------------------------- |
| 56 // UTF16Buffer - scanner input source with pushback. |
| 57 |
45 class UTF16Buffer { | 58 class UTF16Buffer { |
46 public: | 59 public: |
47 UTF16Buffer(); | 60 UTF16Buffer(); |
48 virtual ~UTF16Buffer() {} | 61 virtual ~UTF16Buffer() {} |
49 | 62 |
50 virtual void PushBack(uc32 ch) = 0; | 63 virtual void PushBack(uc32 ch) = 0; |
51 // Returns a value < 0 when the buffer end is reached. | 64 // Returns a value < 0 when the buffer end is reached. |
52 virtual uc32 Advance() = 0; | 65 virtual uc32 Advance() = 0; |
53 virtual void SeekForward(int pos) = 0; | 66 virtual void SeekForward(int pos) = 0; |
54 | 67 |
55 int pos() const { return pos_; } | 68 int pos() const { return pos_; } |
56 | 69 |
| 70 static const int kNoEndPosition = 1; |
| 71 |
57 protected: | 72 protected: |
| 73 // NOTE(review): kNoEndPosition (declared above) is presumably the initial value of end_ before the input stream is initialized -- confirm. |
| 74 |
58 int pos_; // Current position in the buffer. | 75 int pos_; // Current position in the buffer. |
59 int end_; // Position where scanning should stop (EOF). | 76 int end_; // Position where scanning should stop (EOF). |
60 }; | 77 }; |
61 | 78 |
62 | 79 |
63 class ScannerConstants : AllStatic { | 80 class ScannerConstants : AllStatic { |
64 public: | 81 public: |
| // Decoder type used for UTF-8 input: a unibrow::Utf8InputBuffer<1024>. |
65 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 82 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
66 | 83 |
| // Returns a pointer to the class-wide utf8_decoder_ resource. |
67 static StaticResource<Utf8Decoder>* utf8_decoder() { | 84 static StaticResource<Utf8Decoder>* utf8_decoder() { |
68 return &utf8_decoder_; | 85 return &utf8_decoder_; |
69 } | 86 } |
70 | 87 |
| // Character-class predicates; one static instance per class. |
71 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; | 88 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; |
72 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; | 89 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; |
73 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | 90 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
74 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | 91 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
75 | 92 |
| // NOTE(review): presumably tells whether buffer holds an identifier; defined elsewhere -- confirm. |
76 static bool IsIdentifier(unibrow::CharacterStream* buffer); | 93 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
77 | 94 |
78 private: | 95 private: |
79 static StaticResource<Utf8Decoder> utf8_decoder_; | 96 static StaticResource<Utf8Decoder> utf8_decoder_; |
80 }; | 97 }; |
81 | 98 |
| 99 // ---------------------------------------------------------------------------- |
| 100 // LiteralCollector - Collector of chars of literals. |
| 101 |
| 102 class LiteralCollector { |
| 103 public: |
| 104 LiteralCollector(); |
| 105 ~LiteralCollector(); |
| 106 |
| 107 inline void AddChar(uc32 c) { |
| 108 if (recording_) { |
| 109 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { |
| 110 buffer_.Add(static_cast<char>(c)); |
| 111 } else { |
| 112 AddCharSlow(c); |
| 113 } |
| 114 } |
| 115 } |
| 116 |
| 117 void StartLiteral() { |
| 118 buffer_.StartSequence(); |
| 119 recording_ = true; |
| 120 } |
| 121 |
| 122 Vector<const char> EndLiteral() { |
| 123 if (recording_) { |
| 124 recording_ = false; |
| 125 buffer_.Add(kEndMarker); |
| 126 Vector<char> sequence = buffer_.EndSequence(); |
| 127 return Vector<const char>(sequence.start(), sequence.length()); |
| 128 } |
| 129 return Vector<const char>(); |
| 130 } |
| 131 |
| 132 void DropLiteral() { |
| 133 if (recording_) { |
| 134 recording_ = false; |
| 135 buffer_.DropSequence(); |
| 136 } |
| 137 } |
| 138 |
| 139 void Reset() { |
| 140 buffer_.Reset(); |
| 141 } |
| 142 |
| 143 // The end marker that EndLiteral() appends after a collected literal. |
| 144 // Using zero allows the usage of strlen and similar functions on |
| 145 // identifiers and numbers (but not strings, since they may contain zero |
| 146 // bytes). |
| 147 static const char kEndMarker = '\x00'; |
| 148 private: |
| 149 static const int kInitialCapacity = 256; |
| 150 SequenceCollector<char, 4> buffer_; |
| 151 bool recording_; |
| 152 void AddCharSlow(uc32 c); |
| 153 }; |
| 154 |
| 155 // ---------------------------------------------------------------------------- |
| 156 // Scanner base-class. |
| 157 |
| 158 // Generic functionality used by both JSON and JavaScript scanners. |
| 159 class Scanner { |
| 160 public: |
| 161 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
| 162 |
| 163 class LiteralScope { |
| 164 public: |
| 165 explicit LiteralScope(Scanner* self); |
| 166 ~LiteralScope(); |
| 167 void Complete(); |
| 168 |
| 169 private: |
| 170 Scanner* scanner_; |
| 171 bool complete_; |
| 172 }; |
| 173 |
| 174 Scanner(); |
| 175 |
| 176 // Returns the current token again. |
| 177 Token::Value current_token() { return current_.token; } |
| 178 |
| 179 // One token look-ahead (past the token returned by Next()). |
| 180 Token::Value peek() const { return next_.token; } |
| 181 |
| 182 struct Location { |
| 183 Location(int b, int e) : beg_pos(b), end_pos(e) { } |
| 184 Location() : beg_pos(0), end_pos(0) { } |
| 185 int beg_pos; |
| 186 int end_pos; |
| 187 }; |
| 188 |
| 189 // Returns the location information for the current token |
| 190 // (the token returned by Next()). |
| 191 Location location() const { return current_.location; } |
| 192 Location peek_location() const { return next_.location; } |
| 193 |
| 194 // Returns the literal string, if any, for the current token (the |
| 195 // token returned by Next()). The string is 0-terminated and in |
| 196 // UTF-8 format; it may contain 0-characters. Literal strings are |
| 197 // collected for identifiers, strings, and numbers. |
| 198 // These functions only give the correct result if the literal |
| 199 // was scanned between calls to StartLiteral() and TerminateLiteral(). |
| 200 const char* literal_string() const { |
| 201 return current_.literal_chars.start(); |
| 202 } |
| 203 |
| 204 int literal_length() const { |
| 205 // Excluding terminal '\x00' added by TerminateLiteral(). |
| 206 return current_.literal_chars.length() - 1; |
| 207 } |
| 208 |
| 209 Vector<const char> literal() const { |
| 210 return Vector<const char>(literal_string(), literal_length()); |
| 211 } |
| 212 |
| 213 // Returns the literal string for the next token (the token that |
| 214 // would be returned if Next() were called). |
| 215 const char* next_literal_string() const { |
| 216 return next_.literal_chars.start(); |
| 217 } |
| 218 |
| 219 |
| 220 // Returns the length of the literal of the next token (the token |
| 221 // that would be returned if Next() were called). |
| 222 int next_literal_length() const { |
| 223 // Excluding terminal '\x00' added by TerminateLiteral(). |
| 224 return next_.literal_chars.length() - 1; |
| 225 } |
| 226 |
| 227 Vector<const char> next_literal() const { |
| 228 return Vector<const char>(next_literal_string(), next_literal_length()); |
| 229 } |
| 230 |
| 231 bool stack_overflow() { return stack_overflow_; } |
| 232 |
| 233 static const int kCharacterLookaheadBufferSize = 1; |
| 234 |
| 235 protected: |
| 236 // The current and look-ahead token. |
| 237 struct TokenDesc { |
| 238 Token::Value token; |
| 239 Location location; |
| 240 Vector<const char> literal_chars; |
| 241 }; |
| 242 |
| 243 // Call this after setting source_ to the input. |
| 244 void Init() { |
| 245 // Set c0_ (one character ahead) |
| 246 ASSERT(kCharacterLookaheadBufferSize == 1); |
| 247 Advance(); |
| 248 // Initialize current_ to not refer to a literal. |
| 249 current_.literal_chars = Vector<const char>(); |
| 250 // Reset literal buffer. |
| 251 literal_buffer_.Reset(); |
| 252 } |
| 253 |
| 254 // Literal buffer support |
| 255 inline void StartLiteral() { |
| 256 literal_buffer_.StartLiteral(); |
| 257 } |
| 258 |
| 259 inline void AddLiteralChar(uc32 c) { |
| 260 literal_buffer_.AddChar(c); |
| 261 } |
| 262 |
| 263 // Complete scanning of a literal. |
| 264 inline void TerminateLiteral() { |
| 265 next_.literal_chars = literal_buffer_.EndLiteral(); |
| 266 } |
| 267 |
| 268 // Stops scanning of a literal and drops the collected characters, |
| 269 // e.g., due to an encountered error. |
| 270 inline void DropLiteral() { |
| 271 literal_buffer_.DropLiteral(); |
| 272 } |
| 273 |
| 274 inline void AddLiteralCharAdvance() { |
| 275 AddLiteralChar(c0_); |
| 276 Advance(); |
| 277 } |
| 278 |
| 279 // Low-level scanning support. |
| 280 void Advance() { c0_ = source_->Advance(); } |
| 281 void PushBack(uc32 ch) { |
| 282 source_->PushBack(ch); |
| 283 c0_ = ch; |
| 284 } |
| 285 |
| 286 inline Token::Value Select(Token::Value tok) { |
| 287 Advance(); |
| 288 return tok; |
| 289 } |
| 290 |
| 291 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { |
| 292 Advance(); |
| 293 if (c0_ == next) { |
| 294 Advance(); |
| 295 return then; |
| 296 } else { |
| 297 return else_; |
| 298 } |
| 299 } |
| 300 |
| 301 uc32 ScanHexEscape(uc32 c, int length); |
| 302 uc32 ScanOctalEscape(uc32 c, int length); |
| 303 |
| 304 // Return the current source position. |
| 305 int source_pos() { |
| 306 return source_->pos() - kCharacterLookaheadBufferSize; |
| 307 } |
| 308 |
| 309 TokenDesc current_; // desc for current token (as returned by Next()) |
| 310 TokenDesc next_; // desc for next token (one token look-ahead) |
| 311 |
| 312 // Input stream. Must be initialized to a UTF16Buffer. |
| 313 UTF16Buffer* source_; |
| 314 |
| 315 // Buffer to hold literal values (identifiers, strings, numbers) |
| 316 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. |
| 317 LiteralCollector literal_buffer_; |
| 318 |
| 319 bool stack_overflow_; |
| 320 |
| 321 // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 322 uc32 c0_; |
| 323 }; |
| 324 |
| 325 // ---------------------------------------------------------------------------- |
| 326 // JavaScriptScanner - base logic for JavaScript scanning. |
| 327 |
| 328 class JavaScriptScanner : public Scanner { |
| 329 public: |
| 330 JavaScriptScanner(); |
| 331 |
| 332 // Returns the next token. |
| 333 Token::Value Next(); |
| 334 |
| 335 // Returns true if there was a line terminator before the peeked token. |
| 336 bool has_line_terminator_before_next() const { |
| 337 return has_line_terminator_before_next_; |
| 338 } |
| 339 |
| 340 // Scans the input as a regular expression pattern; the previous |
| 341 // character(s) must be '/' or '/='. Returns true if a pattern is scanned. |
| 342 bool ScanRegExpPattern(bool seen_equal); |
| 343 // Returns true if regexp flags are scanned (always since flags can |
| 344 // be empty). |
| 345 bool ScanRegExpFlags(); |
| 346 |
| 347 // Tells whether the buffer contains an identifier (no escapes). |
| 348 // Used for checking if a property name is an identifier. |
| 349 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 350 |
| 351 // Seek forward to the given position. This operation does not |
| 352 // work in general, for instance when there are pushed back |
| 353 // characters, but works for seeking forward until simple delimiter |
| 354 // tokens, which is what it is used for. |
| 355 void SeekForward(int pos); |
| 356 |
| 357 protected: |
| 358 bool SkipWhiteSpace(); |
| 359 Token::Value SkipSingleLineComment(); |
| 360 Token::Value SkipMultiLineComment(); |
| 361 |
| 362 // Scans a single JavaScript token. |
| 363 void Scan(); |
| 364 |
| 365 void ScanDecimalDigits(); |
| 366 Token::Value ScanNumber(bool seen_period); |
| 367 Token::Value ScanIdentifier(); |
| 368 |
| 369 void ScanEscape(); |
| 370 Token::Value ScanString(); |
| 371 |
| 372 // Scans a possible HTML comment -- begins with '<!'. |
| 373 Token::Value ScanHtmlComment(); |
| 374 |
| 375 // Decodes a unicode escape-sequence which is part of an identifier. |
| 376 // If the escape sequence cannot be decoded the result is kBadChar. |
| 377 uc32 ScanIdentifierUnicodeEscape(); |
| 378 |
| 379 bool has_line_terminator_before_next_; |
| 380 }; |
| 381 |
| 382 |
| 383 // ---------------------------------------------------------------------------- |
| 384 // Keyword matching state machine. |
82 | 385 |
83 class KeywordMatcher { | 386 class KeywordMatcher { |
84 // Incrementally recognize keywords. | 387 // Incrementally recognize keywords. |
85 // | 388 // |
86 // Recognized keywords: | 389 // Recognized keywords: |
87 // break case catch const* continue debugger* default delete do else | 390 // break case catch const* continue debugger* default delete do else |
88 // finally false for function if in instanceof native* new null | 391 // finally false for function if in instanceof native* new null |
89 // return switch this throw true try typeof var void while with | 392 // return switch this throw true try typeof var void while with |
90 // | 393 // |
91 // *: Actually "future reserved keywords". These are the only ones we | 394 // *: Actually "future reserved keywords". These are the only ones we |
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
197 // keyword with the current prefix). | 500 // keyword with the current prefix). |
198 const char* keyword_; | 501 const char* keyword_; |
199 int counter_; | 502 int counter_; |
200 Token::Value keyword_token_; | 503 Token::Value keyword_token_; |
201 }; | 504 }; |
202 | 505 |
203 | 506 |
204 } } // namespace v8::internal | 507 } } // namespace v8::internal |
205 | 508 |
206 #endif // V8_SCANNER_BASE_H_ | 509 #endif // V8_SCANNER_BASE_H_ |
OLD | NEW |