| OLD | NEW |
| 1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 17 matching lines...) Expand all Loading... |
| 28 #ifndef V8_SCANNER_H_ | 28 #ifndef V8_SCANNER_H_ |
| 29 #define V8_SCANNER_H_ | 29 #define V8_SCANNER_H_ |
| 30 | 30 |
| 31 #include "token.h" | 31 #include "token.h" |
| 32 #include "char-predicates-inl.h" | 32 #include "char-predicates-inl.h" |
| 33 #include "scanner-base.h" | 33 #include "scanner-base.h" |
| 34 | 34 |
| 35 namespace v8 { | 35 namespace v8 { |
| 36 namespace internal { | 36 namespace internal { |
| 37 | 37 |
| 38 | |
| 39 class UTF8Buffer { | |
| 40 public: | |
| 41 UTF8Buffer(); | |
| 42 ~UTF8Buffer(); | |
| 43 | |
| 44 inline void AddChar(uc32 c) { | |
| 45 if (recording_) { | |
| 46 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { | |
| 47 buffer_.Add(static_cast<char>(c)); | |
| 48 } else { | |
| 49 AddCharSlow(c); | |
| 50 } | |
| 51 } | |
| 52 } | |
| 53 | |
| 54 void StartLiteral() { | |
| 55 buffer_.StartSequence(); | |
| 56 recording_ = true; | |
| 57 } | |
| 58 | |
| 59 Vector<const char> EndLiteral() { | |
| 60 if (recording_) { | |
| 61 recording_ = false; | |
| 62 buffer_.Add(kEndMarker); | |
| 63 Vector<char> sequence = buffer_.EndSequence(); | |
| 64 return Vector<const char>(sequence.start(), sequence.length()); | |
| 65 } | |
| 66 return Vector<const char>(); | |
| 67 } | |
| 68 | |
| 69 void DropLiteral() { | |
| 70 if (recording_) { | |
| 71 recording_ = false; | |
| 72 buffer_.DropSequence(); | |
| 73 } | |
| 74 } | |
| 75 | |
| 76 void Reset() { | |
| 77 buffer_.Reset(); | |
| 78 } | |
| 79 | |
| 80 // The end marker added after a parsed literal. | |
| 81 // Using zero allows the usage of strlen and similar functions on | |
| 82 // identifiers and numbers (but not strings, since they may contain zero | |
| 83 // bytes). | |
| 84 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside | |
| 85 // an utf-8 string. This requires changes in all places that uses | |
| 86 // str-functions on the literals, but allows a single pointer to represent | |
| 87 // the literal, even if it contains embedded zeros. | |
| 88 static const char kEndMarker = '\x00'; | |
| 89 private: | |
| 90 static const int kInitialCapacity = 256; | |
| 91 SequenceCollector<char, 4> buffer_; | |
| 92 bool recording_; | |
| 93 void AddCharSlow(uc32 c); | |
| 94 }; | |
| 95 | |
| 96 | |
| 97 // UTF16 buffer to read characters from a character stream. | 38 // UTF16 buffer to read characters from a character stream. |
| 98 class CharacterStreamUTF16Buffer: public UTF16Buffer { | 39 class CharacterStreamUTF16Buffer: public UTF16Buffer { |
| 99 public: | 40 public: |
| 100 CharacterStreamUTF16Buffer(); | 41 CharacterStreamUTF16Buffer(); |
| 101 virtual ~CharacterStreamUTF16Buffer() {} | 42 virtual ~CharacterStreamUTF16Buffer() {} |
| 102 void Initialize(Handle<String> data, | 43 void Initialize(Handle<String> data, |
| 103 unibrow::CharacterStream* stream, | 44 unibrow::CharacterStream* stream, |
| 104 int start_position, | 45 int start_position, |
| 105 int end_position); | 46 int end_position); |
| 106 virtual void PushBack(uc32 ch); | 47 virtual void PushBack(uc32 ch); |
| (...skipping 20 matching lines...) Expand all Loading... |
| 127 int end_position); | 68 int end_position); |
| 128 virtual void PushBack(uc32 ch); | 69 virtual void PushBack(uc32 ch); |
| 129 virtual uc32 Advance(); | 70 virtual uc32 Advance(); |
| 130 virtual void SeekForward(int pos); | 71 virtual void SeekForward(int pos); |
| 131 | 72 |
| 132 private: | 73 private: |
| 133 const CharType* raw_data_; // Pointer to the actual array of characters. | 74 const CharType* raw_data_; // Pointer to the actual array of characters. |
| 134 }; | 75 }; |
| 135 | 76 |
| 136 | 77 |
| 137 enum ParserLanguage { JAVASCRIPT, JSON }; | 78 // Initializes a UTF16Buffer as input stream, using one of a number |
| 79 // of strategies depending on the available character sources. |
| 80 class StreamInitializer { |
| 81 public: |
| 82 UTF16Buffer* Init(Handle<String> source, |
| 83 unibrow::CharacterStream* stream, |
| 84 int start_position, |
| 85 int end_position); |
| 86 private: |
| 87 // Different UTF16 buffers used to pull characters from. Based on input one of |
| 88 // these will be initialized as the actual data source. |
| 89 CharacterStreamUTF16Buffer char_stream_buffer_; |
| 90 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> |
| 91 two_byte_string_buffer_; |
| 92 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; |
| 93 |
| 94 // Used to convert the source string into a character stream when a stream |
| 95 // is not passed to the scanner. |
| 96 SafeStringInputBuffer safe_string_input_buffer_; |
| 97 }; |
| 98 |
| 99 // ---------------------------------------------------------------------------- |
| 100 // V8JavaScriptScanner |
| 101 // JavaScript scanner getting its input from either a V8 String or a unicode |
| 102 // CharacterStream. |
| 103 |
| 104 class V8JavaScriptScanner : public JavaScriptScanner { |
| 105 public: |
| 106 V8JavaScriptScanner() {} |
| 107 |
| 108 Token::Value NextCheckStack(); |
| 109 |
| 110 // Initialize the Scanner to scan source. |
| 111 void Initialize(Handle<String> source); |
| 112 void Initialize(Handle<String> source, |
| 113 unibrow::CharacterStream* stream); |
| 114 void Initialize(Handle<String> source, |
| 115 int start_position, int end_position); |
| 116 |
| 117 protected: |
| 118 StreamInitializer stream_initializer_; |
| 119 }; |
| 138 | 120 |
| 139 | 121 |
| 140 class Scanner { | 122 class JsonScanner : public Scanner { |
| 141 public: | 123 public: |
| 142 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 124 JsonScanner(); |
| 143 | |
| 144 class LiteralScope { | |
| 145 public: | |
| 146 explicit LiteralScope(Scanner* self); | |
| 147 ~LiteralScope(); | |
| 148 void Complete(); | |
| 149 | |
| 150 private: | |
| 151 Scanner* scanner_; | |
| 152 bool complete_; | |
| 153 }; | |
| 154 | |
| 155 Scanner(); | |
| 156 | 125 |
| 157 // Initialize the Scanner to scan source. | 126 // Initialize the Scanner to scan source. |
| 158 void Initialize(Handle<String> source, | 127 void Initialize(Handle<String> source); |
| 159 ParserLanguage language); | |
| 160 void Initialize(Handle<String> source, | |
| 161 unibrow::CharacterStream* stream, | |
| 162 ParserLanguage language); | |
| 163 void Initialize(Handle<String> source, | |
| 164 int start_position, int end_position, | |
| 165 ParserLanguage language); | |
| 166 | 128 |
| 167 // Returns the next token. | 129 // Returns the next token. |
| 168 Token::Value Next(); | 130 Token::Value Next(); |
| 169 | 131 |
| 170 // Returns the current token again. | 132 protected: |
| 171 Token::Value current_token() { return current_.token; } | 133 // Skip past JSON whitespace (only space, tab, newline and carriage-return). |
| 172 | |
| 173 // One token look-ahead (past the token returned by Next()). | |
| 174 Token::Value peek() const { return next_.token; } | |
| 175 | |
| 176 // Returns true if there was a line terminator before the peek'ed token. | |
| 177 bool has_line_terminator_before_next() const { | |
| 178 return has_line_terminator_before_next_; | |
| 179 } | |
| 180 | |
| 181 struct Location { | |
| 182 Location(int b, int e) : beg_pos(b), end_pos(e) { } | |
| 183 Location() : beg_pos(0), end_pos(0) { } | |
| 184 int beg_pos; | |
| 185 int end_pos; | |
| 186 }; | |
| 187 | |
| 188 // Returns the location information for the current token | |
| 189 // (the token returned by Next()). | |
| 190 Location location() const { return current_.location; } | |
| 191 Location peek_location() const { return next_.location; } | |
| 192 | |
| 193 // Returns the literal string, if any, for the current token (the | |
| 194 // token returned by Next()). The string is 0-terminated and in | |
| 195 // UTF-8 format; they may contain 0-characters. Literal strings are | |
| 196 // collected for identifiers, strings, and numbers. | |
| 197 // These functions only give the correct result if the literal | |
| 198 // was scanned between calls to StartLiteral() and TerminateLiteral(). | |
| 199 const char* literal_string() const { | |
| 200 return current_.literal_chars.start(); | |
| 201 } | |
| 202 | |
| 203 int literal_length() const { | |
| 204 // Excluding terminal '\x00' added by TerminateLiteral(). | |
| 205 return current_.literal_chars.length() - 1; | |
| 206 } | |
| 207 | |
| 208 Vector<const char> literal() const { | |
| 209 return Vector<const char>(literal_string(), literal_length()); | |
| 210 } | |
| 211 | |
| 212 // Returns the literal string for the next token (the token that | |
| 213 // would be returned if Next() were called). | |
| 214 const char* next_literal_string() const { | |
| 215 return next_.literal_chars.start(); | |
| 216 } | |
| 217 | |
| 218 | |
| 219 // Returns the length of the next token (that would be returned if | |
| 220 // Next() were called). | |
| 221 int next_literal_length() const { | |
| 222 // Excluding terminal '\x00' added by TerminateLiteral(). | |
| 223 return next_.literal_chars.length() - 1; | |
| 224 } | |
| 225 | |
| 226 Vector<const char> next_literal() const { | |
| 227 return Vector<const char>(next_literal_string(), next_literal_length()); | |
| 228 } | |
| 229 | |
| 230 // Scans the input as a regular expression pattern, previous | |
| 231 // character(s) must be /(=). Returns true if a pattern is scanned. | |
| 232 bool ScanRegExpPattern(bool seen_equal); | |
| 233 // Returns true if regexp flags are scanned (always since flags can | |
| 234 // be empty). | |
| 235 bool ScanRegExpFlags(); | |
| 236 | |
| 237 // Seek forward to the given position. This operation does not | |
| 238 // work in general, for instance when there are pushed back | |
| 239 // characters, but works for seeking forward until simple delimiter | |
| 240 // tokens, which is what it is used for. | |
| 241 void SeekForward(int pos); | |
| 242 | |
| 243 bool stack_overflow() { return stack_overflow_; } | |
| 244 | |
| 245 // Tells whether the buffer contains an identifier (no escapes). | |
| 246 // Used for checking if a property name is an identifier. | |
| 247 static bool IsIdentifier(unibrow::CharacterStream* buffer); | |
| 248 | |
| 249 static const int kCharacterLookaheadBufferSize = 1; | |
| 250 static const int kNoEndPosition = 1; | |
| 251 | |
| 252 private: | |
| 253 // The current and look-ahead token. | |
| 254 struct TokenDesc { | |
| 255 Token::Value token; | |
| 256 Location location; | |
| 257 Vector<const char> literal_chars; | |
| 258 }; | |
| 259 | |
| 260 void Init(Handle<String> source, | |
| 261 unibrow::CharacterStream* stream, | |
| 262 int start_position, int end_position, | |
| 263 ParserLanguage language); | |
| 264 | |
| 265 // Literal buffer support | |
| 266 inline void StartLiteral(); | |
| 267 inline void AddLiteralChar(uc32 ch); | |
| 268 inline void AddLiteralCharAdvance(); | |
| 269 inline void TerminateLiteral(); | |
| 270 // Stops scanning of a literal, e.g., due to an encountered error. | |
| 271 inline void DropLiteral(); | |
| 272 | |
| 273 // Low-level scanning support. | |
| 274 void Advance() { c0_ = source_->Advance(); } | |
| 275 void PushBack(uc32 ch) { | |
| 276 source_->PushBack(ch); | |
| 277 c0_ = ch; | |
| 278 } | |
| 279 | |
| 280 bool SkipWhiteSpace() { | |
| 281 if (is_parsing_json_) { | |
| 282 return SkipJsonWhiteSpace(); | |
| 283 } else { | |
| 284 return SkipJavaScriptWhiteSpace(); | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 bool SkipJavaScriptWhiteSpace(); | |
| 289 bool SkipJsonWhiteSpace(); | 134 bool SkipJsonWhiteSpace(); |
| 290 Token::Value SkipSingleLineComment(); | |
| 291 Token::Value SkipMultiLineComment(); | |
| 292 | |
| 293 inline Token::Value Select(Token::Value tok); | |
| 294 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); | |
| 295 | |
| 296 inline void Scan() { | |
| 297 if (is_parsing_json_) { | |
| 298 ScanJson(); | |
| 299 } else { | |
| 300 ScanJavaScript(); | |
| 301 } | |
| 302 } | |
| 303 | |
| 304 // Scans a single JavaScript token. | |
| 305 void ScanJavaScript(); | |
| 306 | 135 |
| 307 // Scan a single JSON token. The JSON lexical grammar is specified in the | 136 // Scan a single JSON token. The JSON lexical grammar is specified in the |
| 308 // ECMAScript 5 standard, section 15.12.1.1. | 137 // ECMAScript 5 standard, section 15.12.1.1. |
| 309 // Recognizes all of the single-character tokens directly, or calls a function | 138 // Recognizes all of the single-character tokens directly, or calls a function |
| 310 // to scan a number, string or identifier literal. | 139 // to scan a number, string or identifier literal. |
| 311 // The only allowed whitespace characters between tokens are tab, | 140 // The only allowed whitespace characters between tokens are tab, |
| 312 // carriage-return, newline and space. | 141 // carriage-return, newline and space. |
| 313 void ScanJson(); | 142 void ScanJson(); |
| 314 | 143 |
| 315 // A JSON number (production JSONNumber) is a subset of the valid JavaScript | 144 // A JSON number (production JSONNumber) is a subset of the valid JavaScript |
| 316 // decimal number literals. | 145 // decimal number literals. |
| 317 // It includes an optional minus sign, must have at least one | 146 // It includes an optional minus sign, must have at least one |
| 318 // digit before and after a decimal point, may not have prefixed zeros (unless | 147 // digit before and after a decimal point, may not have prefixed zeros (unless |
| 319 // the integer part is zero), and may include an exponent part (e.g., "e-10"). | 148 // the integer part is zero), and may include an exponent part (e.g., "e-10"). |
| 320 // Hexadecimal and octal numbers are not allowed. | 149 // Hexadecimal and octal numbers are not allowed. |
| 321 Token::Value ScanJsonNumber(); | 150 Token::Value ScanJsonNumber(); |
| 322 | 151 |
| 323 // A JSON string (production JSONString) is subset of valid JavaScript string | 152 // A JSON string (production JSONString) is subset of valid JavaScript string |
| 324 // literals. The string must only be double-quoted (not single-quoted), and | 153 // literals. The string must only be double-quoted (not single-quoted), and |
| 325 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and | 154 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and |
| 326 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. | 155 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. |
| 327 Token::Value ScanJsonString(); | 156 Token::Value ScanJsonString(); |
| 328 | 157 |
| 329 // Used to recognize one of the literals "true", "false", or "null". These | 158 // Used to recognize one of the literals "true", "false", or "null". These |
| 330 // are the only valid JSON identifiers (productions JSONBooleanLiteral, | 159 // are the only valid JSON identifiers (productions JSONBooleanLiteral, |
| 331 // JSONNullLiteral). | 160 // JSONNullLiteral). |
| 332 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); | 161 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); |
| 333 | 162 |
| 334 void ScanDecimalDigits(); | 163 StreamInitializer stream_initializer_; |
| 335 Token::Value ScanNumber(bool seen_period); | |
| 336 Token::Value ScanIdentifier(); | |
| 337 uc32 ScanHexEscape(uc32 c, int length); | |
| 338 uc32 ScanOctalEscape(uc32 c, int length); | |
| 339 void ScanEscape(); | |
| 340 Token::Value ScanString(); | |
| 341 | |
| 342 // Scans a possible HTML comment -- begins with '<!'. | |
| 343 Token::Value ScanHtmlComment(); | |
| 344 | |
| 345 // Return the current source position. | |
| 346 int source_pos() { | |
| 347 return source_->pos() - kCharacterLookaheadBufferSize; | |
| 348 } | |
| 349 | |
| 350 // Decodes a unicode escape-sequence which is part of an identifier. | |
| 351 // If the escape sequence cannot be decoded the result is kBadRune. | |
| 352 uc32 ScanIdentifierUnicodeEscape(); | |
| 353 | |
| 354 TokenDesc current_; // desc for current token (as returned by Next()) | |
| 355 TokenDesc next_; // desc for next token (one token look-ahead) | |
| 356 bool has_line_terminator_before_next_; | |
| 357 bool is_parsing_json_; | |
| 358 | |
| 359 // Different UTF16 buffers used to pull characters from. Based on input one of | |
| 360 // these will be initialized as the actual data source. | |
| 361 CharacterStreamUTF16Buffer char_stream_buffer_; | |
| 362 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> | |
| 363 two_byte_string_buffer_; | |
| 364 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; | |
| 365 | |
| 366 // Source. Will point to one of the buffers declared above. | |
| 367 UTF16Buffer* source_; | |
| 368 | |
| 369 // Used to convert the source string into a character stream when a stream | |
| 370 // is not passed to the scanner. | |
| 371 SafeStringInputBuffer safe_string_input_buffer_; | |
| 372 | |
| 373 // Buffer to hold literal values (identifiers, strings, numbers) | |
| 374 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. | |
| 375 UTF8Buffer literal_buffer_; | |
| 376 | |
| 377 bool stack_overflow_; | |
| 378 | |
| 379 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | |
| 380 uc32 c0_; | |
| 381 }; | 164 }; |
| 382 | 165 |
| 383 | 166 |
| 384 // ExternalStringUTF16Buffer | 167 // ExternalStringUTF16Buffer |
| 385 template <typename StringType, typename CharType> | 168 template <typename StringType, typename CharType> |
| 386 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() | 169 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() |
| 387 : raw_data_(NULL) { } | 170 : raw_data_(NULL) { } |
| 388 | 171 |
| 389 | 172 |
| 390 template <typename StringType, typename CharType> | 173 template <typename StringType, typename CharType> |
| 391 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( | 174 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( |
| 392 Handle<StringType> data, | 175 Handle<StringType> data, |
| 393 int start_position, | 176 int start_position, |
| 394 int end_position) { | 177 int end_position) { |
| 395 ASSERT(!data.is_null()); | 178 ASSERT(!data.is_null()); |
| 396 raw_data_ = data->resource()->data(); | 179 raw_data_ = data->resource()->data(); |
| 397 | 180 |
| 398 ASSERT(end_position <= data->length()); | 181 ASSERT(end_position <= data->length()); |
| 399 if (start_position > 0) { | 182 if (start_position > 0) { |
| 400 SeekForward(start_position); | 183 SeekForward(start_position); |
| 401 } | 184 } |
| 402 end_ = | 185 end_ = |
| 403 end_position != Scanner::kNoEndPosition ? end_position : data->length(); | 186 end_position != kNoEndPosition ? end_position : data->length(); |
| 404 } | 187 } |
| 405 | 188 |
| 406 | 189 |
| 407 template <typename StringType, typename CharType> | 190 template <typename StringType, typename CharType> |
| 408 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { | 191 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { |
| 409 if (pos_ < end_) { | 192 if (pos_ < end_) { |
| 410 return raw_data_[pos_++]; | 193 return raw_data_[pos_++]; |
| 411 } else { | 194 } else { |
| 412 // note: currently the following increment is necessary to avoid a | 195 // note: currently the following increment is necessary to avoid a |
| 413 // test-parser problem! | 196 // test-parser problem! |
| (...skipping 12 matching lines...) Expand all Loading... |
| 426 | 209 |
| 427 | 210 |
| 428 template <typename StringType, typename CharType> | 211 template <typename StringType, typename CharType> |
| 429 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { | 212 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { |
| 430 pos_ = pos; | 213 pos_ = pos; |
| 431 } | 214 } |
| 432 | 215 |
| 433 } } // namespace v8::internal | 216 } } // namespace v8::internal |
| 434 | 217 |
| 435 #endif // V8_SCANNER_H_ | 218 #endif // V8_SCANNER_H_ |
| OLD | NEW |