| OLD | NEW |
| 1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 131 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 142 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; | 142 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; |
| 143 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | 143 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
| 144 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | 144 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
| 145 StaticResource<Utf8Decoder> utf8_decoder_; | 145 StaticResource<Utf8Decoder> utf8_decoder_; |
| 146 | 146 |
| 147 friend class Isolate; | 147 friend class Isolate; |
| 148 DISALLOW_COPY_AND_ASSIGN(ScannerConstants); | 148 DISALLOW_COPY_AND_ASSIGN(ScannerConstants); |
| 149 }; | 149 }; |
| 150 | 150 |
| 151 // ---------------------------------------------------------------------------- | 151 // ---------------------------------------------------------------------------- |
| 152 // LiteralCollector - Collector of chars of literals. | 152 // LiteralBuffer - Collector of chars of literals. |
| 153 | 153 |
| 154 class LiteralCollector { | 154 class LiteralBuffer { |
| 155 public: | 155 public: |
| 156 LiteralCollector(); | 156 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { } |
| 157 ~LiteralCollector(); | |
| 158 | 157 |
| 159 inline void AddChar(uc32 c) { | 158 ~LiteralBuffer() { |
| 160 if (recording_) { | 159 if (backing_store_.length() > 0) { |
| 161 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { | 160 backing_store_.Dispose(); |
| 162 buffer_.Add(static_cast<char>(c)); | |
| 163 } else { | |
| 164 AddCharSlow(c); | |
| 165 } | |
| 166 } | 161 } |
| 167 } | 162 } |
| 168 | 163 |
| 169 void StartLiteral() { | 164 inline void AddChar(uc16 character) { |
| 170 buffer_.StartSequence(); | 165 if (position_ >= backing_store_.length()) ExpandBuffer(); |
| 171 recording_ = true; | 166 if (is_ascii_) { |
| 167 if (character < kMaxAsciiCharCodeU) { |
| 168 backing_store_[position_] = static_cast<byte>(character); |
| 169 position_ += kASCIISize; |
| 170 return; |
| 171 } |
| 172 ConvertToUC16(); |
| 173 } |
| 174 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; |
| 175 position_ += kUC16Size; |
| 172 } | 176 } |
| 173 | 177 |
| 174 Vector<const char> EndLiteral() { | 178 bool is_ascii() { return is_ascii_; } |
| 175 if (recording_) { | 179 |
| 176 recording_ = false; | 180 Vector<const uc16> uc16_literal() { |
| 177 buffer_.Add(kEndMarker); | 181 ASSERT(!is_ascii_); |
| 178 Vector<char> sequence = buffer_.EndSequence(); | 182 ASSERT((position_ & 0x1) == 0); |
| 179 return Vector<const char>(sequence.start(), sequence.length()); | 183 return Vector<const uc16>( |
| 180 } | 184 reinterpret_cast<const uc16*>(backing_store_.start()), |
| 181 return Vector<const char>(); | 185 position_ >> 1); |
| 182 } | 186 } |
| 183 | 187 |
| 184 void DropLiteral() { | 188 Vector<const char> ascii_literal() { |
| 185 if (recording_) { | 189 ASSERT(is_ascii_); |
| 186 recording_ = false; | 190 return Vector<const char>( |
| 187 buffer_.DropSequence(); | 191 reinterpret_cast<const char*>(backing_store_.start()), |
| 188 } | 192 position_); |
| 193 } |
| 194 |
| 195 int length() { |
| 196 return is_ascii_ ? position_ : (position_ >> 1); |
| 189 } | 197 } |
| 190 | 198 |
| 191 void Reset() { | 199 void Reset() { |
| 192 buffer_.Reset(); | 200 position_ = 0; |
| 201 is_ascii_ = true; |
| 202 } |
| 203 private: |
| 204 static const int kInitialCapacity = 16; |
| 205 static const int kGrowthFactory = 4; |
| 206 static const int kMinConversionSlack = 256; |
| 207 static const int kMaxGrowth = 1 * MB; |
| 208 inline int NewCapacity(int min_capacity) { |
| 209 int capacity = Max(min_capacity, backing_store_.length()); |
| 210 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); |
| 211 return new_capacity; |
| 193 } | 212 } |
| 194 | 213 |
| 195 // The end marker added after a parsed literal. | 214 void ExpandBuffer() { |
| 196 // Using zero allows the usage of strlen and similar functions on | 215 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); |
| 197 // identifiers and numbers (but not strings, since they may contain zero | 216 memcpy(new_store.start(), backing_store_.start(), position_); |
| 198 // bytes). | 217 backing_store_.Dispose(); |
| 199 static const char kEndMarker = '\x00'; | 218 backing_store_ = new_store; |
| 200 private: | 219 } |
| 201 static const int kInitialCapacity = 256; | 220 |
| 202 SequenceCollector<char, 4> buffer_; | 221 void ConvertToUC16() { |
| 203 bool recording_; | 222 ASSERT(is_ascii_); |
| 204 void AddCharSlow(uc32 c); | 223 Vector<byte> new_store; |
| 224 int new_content_size = position_ * kUC16Size; |
| 225 if (new_content_size >= backing_store_.length()) { |
| 226 // Ensure room for all currently read characters as UC16 as well |
| 227 // as the character about to be stored. |
| 228 new_store = Vector<byte>::New(NewCapacity(new_content_size)); |
| 229 } else { |
| 230 new_store = backing_store_; |
| 231 } |
| 232 char* src = reinterpret_cast<char*>(backing_store_.start()); |
| 233 uc16* dst = reinterpret_cast<uc16*>(new_store.start()); |
| 234 for (int i = position_ - 1; i >= 0; i--) { |
| 235 dst[i] = src[i]; |
| 236 } |
| 237 if (new_store.start() != backing_store_.start()) { |
| 238 backing_store_.Dispose(); |
| 239 backing_store_ = new_store; |
| 240 } |
| 241 position_ = new_content_size; |
| 242 is_ascii_ = false; |
| 243 } |
| 244 |
| 245 bool is_ascii_; |
| 246 int position_; |
| 247 Vector<byte> backing_store_; |
| 205 }; | 248 }; |
| 206 | 249 |
| 250 |
| 207 // ---------------------------------------------------------------------------- | 251 // ---------------------------------------------------------------------------- |
| 208 // Scanner base-class. | 252 // Scanner base-class. |
| 209 | 253 |
| 210 // Generic functionality used by both JSON and JavaScript scanners. | 254 // Generic functionality used by both JSON and JavaScript scanners. |
| 211 class Scanner { | 255 class Scanner { |
| 212 public: | 256 public: |
| 213 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 257 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
| 214 | 258 |
| 215 class LiteralScope { | 259 class LiteralScope { |
| 216 public: | 260 public: |
| (...skipping 25 matching lines...) Expand all Loading... |
| 242 // (the token returned by Next()). | 286 // (the token returned by Next()). |
| 243 Location location() const { return current_.location; } | 287 Location location() const { return current_.location; } |
| 244 Location peek_location() const { return next_.location; } | 288 Location peek_location() const { return next_.location; } |
| 245 | 289 |
| 246 // Returns the literal string, if any, for the current token (the | 290 // Returns the literal string, if any, for the current token (the |
| 247 // token returned by Next()). The string is 0-terminated and in | 291 // token returned by Next()). The literal is ASCII or UC16, is not |
| 248 // UTF-8 format; they may contain 0-characters. Literal strings are | 292 // 0-terminated, and may contain 0-characters. Literal strings are |
| 249 // collected for identifiers, strings, and numbers. | 293 // collected for identifiers, strings, and numbers. |
| 250 // These functions only give the correct result if the literal | 294 // These functions only give the correct result if the literal |
| 251 // was scanned between calls to StartLiteral() and TerminateLiteral(). | 295 // was scanned between calls to StartLiteral() and TerminateLiteral(). |
| 252 const char* literal_string() const { | 296 bool is_literal_ascii() { |
| 253 return current_.literal_chars.start(); | 297 ASSERT_NOT_NULL(current_.literal_chars); |
| 298 return current_.literal_chars->is_ascii(); |
| 254 } | 299 } |
| 255 | 300 Vector<const char> literal_ascii_string() { |
| 301 ASSERT_NOT_NULL(current_.literal_chars); |
| 302 return current_.literal_chars->ascii_literal(); |
| 303 } |
| 304 Vector<const uc16> literal_uc16_string() { |
| 305 ASSERT_NOT_NULL(current_.literal_chars); |
| 306 return current_.literal_chars->uc16_literal(); |
| 307 } |
| 256 int literal_length() const { | 308 int literal_length() const { |
| 257 // Excluding terminal '\x00' added by TerminateLiteral(). | 309 ASSERT_NOT_NULL(current_.literal_chars); |
| 258 return current_.literal_chars.length() - 1; | 310 return current_.literal_chars->length(); |
| 259 } | |
| 260 | |
| 261 Vector<const char> literal() const { | |
| 262 return Vector<const char>(literal_string(), literal_length()); | |
| 263 } | 311 } |
| 264 | 312 |
| 265 // Returns the literal string for the next token (the token that | 313 // Returns the literal string for the next token (the token that |
| 266 // would be returned if Next() were called). | 314 // would be returned if Next() were called). |
| 267 const char* next_literal_string() const { | 315 bool is_next_literal_ascii() { |
| 268 return next_.literal_chars.start(); | 316 ASSERT_NOT_NULL(next_.literal_chars); |
| 317 return next_.literal_chars->is_ascii(); |
| 269 } | 318 } |
| 270 | 319 Vector<const char> next_literal_ascii_string() { |
| 271 | 320 ASSERT_NOT_NULL(next_.literal_chars); |
| 272 // Returns the length of the next token (that would be returned if | 321 return next_.literal_chars->ascii_literal(); |
| 273 // Next() were called). | 322 } |
| 323 Vector<const uc16> next_literal_uc16_string() { |
| 324 ASSERT_NOT_NULL(next_.literal_chars); |
| 325 return next_.literal_chars->uc16_literal(); |
| 326 } |
| 274 int next_literal_length() const { | 327 int next_literal_length() const { |
| 275 // Excluding terminal '\x00' added by TerminateLiteral(). | 328 ASSERT_NOT_NULL(next_.literal_chars); |
| 276 return next_.literal_chars.length() - 1; | 329 return next_.literal_chars->length(); |
| 277 } | |
| 278 | |
| 279 Vector<const char> next_literal() const { | |
| 280 return Vector<const char>(next_literal_string(), next_literal_length()); | |
| 281 } | 330 } |
| 282 | 331 |
| 283 static const int kCharacterLookaheadBufferSize = 1; | 332 static const int kCharacterLookaheadBufferSize = 1; |
| 284 | 333 |
| 285 protected: | 334 protected: |
| 286 // The current and look-ahead token. | 335 // The current and look-ahead token. |
| 287 struct TokenDesc { | 336 struct TokenDesc { |
| 288 Token::Value token; | 337 Token::Value token; |
| 289 Location location; | 338 Location location; |
| 290 Vector<const char> literal_chars; | 339 LiteralBuffer* literal_chars; |
| 291 }; | 340 }; |
| 292 | 341 |
| 293 // Call this after setting source_ to the input. | 342 // Call this after setting source_ to the input. |
| 294 void Init() { | 343 void Init() { |
| 295 // Set c0_ (one character ahead) | 344 // Set c0_ (one character ahead) |
| 296 ASSERT(kCharacterLookaheadBufferSize == 1); | 345 ASSERT(kCharacterLookaheadBufferSize == 1); |
| 297 Advance(); | 346 Advance(); |
| 298 // Initialize current_ to not refer to a literal. | 347 // Initialize current_ to not refer to a literal. |
| 299 current_.literal_chars = Vector<const char>(); | 348 current_.literal_chars = NULL; |
| 300 // Reset literal buffer. | |
| 301 literal_buffer_.Reset(); | |
| 302 } | 349 } |
| 303 | 350 |
| 304 // Literal buffer support | 351 // Literal buffer support |
| 305 inline void StartLiteral() { | 352 inline void StartLiteral() { |
| 306 literal_buffer_.StartLiteral(); | 353 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ? |
| 354 &literal_buffer2_ : &literal_buffer1_; |
| 355 free_buffer->Reset(); |
| 356 next_.literal_chars = free_buffer; |
| 307 } | 357 } |
| 308 | 358 |
| 309 inline void AddLiteralChar(uc32 c) { | 359 inline void AddLiteralChar(uc32 c) { |
| 310 literal_buffer_.AddChar(c); | 360 ASSERT_NOT_NULL(next_.literal_chars); |
| 361 next_.literal_chars->AddChar(c); |
| 311 } | 362 } |
| 312 | 363 |
| 313 // Complete scanning of a literal. | 364 // Complete scanning of a literal. |
| 314 inline void TerminateLiteral() { | 365 inline void TerminateLiteral() { |
| 315 next_.literal_chars = literal_buffer_.EndLiteral(); | 366 // Does nothing in the current implementation. |
| 316 } | 367 } |
| 317 | 368 |
| 318 // Stops scanning of a literal and drops the collected characters, | 369 // Stops scanning of a literal and drops the collected characters, |
| 319 // e.g., due to an encountered error. | 370 // e.g., due to an encountered error. |
| 320 inline void DropLiteral() { | 371 inline void DropLiteral() { |
| 321 literal_buffer_.DropLiteral(); | 372 next_.literal_chars = NULL; |
| 322 } | 373 } |
| 323 | 374 |
| 324 inline void AddLiteralCharAdvance() { | 375 inline void AddLiteralCharAdvance() { |
| 325 AddLiteralChar(c0_); | 376 AddLiteralChar(c0_); |
| 326 Advance(); | 377 Advance(); |
| 327 } | 378 } |
| 328 | 379 |
| 329 // Low-level scanning support. | 380 // Low-level scanning support. |
| 330 void Advance() { c0_ = source_->Advance(); } | 381 void Advance() { c0_ = source_->Advance(); } |
| 331 void PushBack(uc32 ch) { | 382 void PushBack(uc32 ch) { |
| (...skipping 19 matching lines...) Expand all Loading... |
| 351 uc32 ScanHexEscape(uc32 c, int length); | 402 uc32 ScanHexEscape(uc32 c, int length); |
| 352 uc32 ScanOctalEscape(uc32 c, int length); | 403 uc32 ScanOctalEscape(uc32 c, int length); |
| 353 | 404 |
| 354 // Return the current source position. | 405 // Return the current source position. |
| 355 int source_pos() { | 406 int source_pos() { |
| 356 return source_->pos() - kCharacterLookaheadBufferSize; | 407 return source_->pos() - kCharacterLookaheadBufferSize; |
| 357 } | 408 } |
| 358 | 409 |
| 359 ScannerConstants* scanner_constants_; | 410 ScannerConstants* scanner_constants_; |
| 360 | 411 |
| 412 // Buffers collecting literal strings, numbers, etc. |
| 413 LiteralBuffer literal_buffer1_; |
| 414 LiteralBuffer literal_buffer2_; |
| 415 |
| 361 TokenDesc current_; // desc for current token (as returned by Next()) | 416 TokenDesc current_; // desc for current token (as returned by Next()) |
| 362 TokenDesc next_; // desc for next token (one token look-ahead) | 417 TokenDesc next_; // desc for next token (one token look-ahead) |
| 363 | 418 |
| 364 // Input stream. Must be initialized to an UC16CharacterStream. | 419 // Input stream. Must be initialized to an UC16CharacterStream. |
| 365 UC16CharacterStream* source_; | 420 UC16CharacterStream* source_; |
| 366 | 421 |
| 367 // Buffer to hold literal values (identifiers, strings, numbers) | |
| 368 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. | |
| 369 LiteralCollector literal_buffer_; | |
| 370 | 422 |
| 371 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | 423 // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 372 uc32 c0_; | 424 uc32 c0_; |
| 373 }; | 425 }; |
| 374 | 426 |
| 375 // ---------------------------------------------------------------------------- | 427 // ---------------------------------------------------------------------------- |
| 376 // JavaScriptScanner - base logic for JavaScript scanning. | 428 // JavaScriptScanner - base logic for JavaScript scanning. |
| 377 | 429 |
| 378 class JavaScriptScanner : public Scanner { | 430 class JavaScriptScanner : public Scanner { |
| 379 public: | 431 public: |
| 380 | |
| 381 // Bit vector representing set of types of literals. | |
| 382 enum LiteralType { | |
| 383 kNoLiterals = 0, | |
| 384 kLiteralNumber = 1, | |
| 385 kLiteralIdentifier = 2, | |
| 386 kLiteralString = 4, | |
| 387 kLiteralRegExp = 8, | |
| 388 kLiteralRegExpFlags = 16, | |
| 389 kAllLiterals = 31 | |
| 390 }; | |
| 391 | |
| 392 // A LiteralScope that disables recording of some types of JavaScript | 432 // A LiteralScope that starts recording a literal when constructed. |
| 393 // literals. If the scanner is configured to not record the specific | 433 // Unless Complete() is called before the scope is destroyed, the |
| 394 // type of literal, the scope will not call StartLiteral. | 434 // literal being collected is dropped. |
| 395 class LiteralScope { | 435 class LiteralScope { |
| 396 public: | 436 public: |
| 397 LiteralScope(JavaScriptScanner* self, LiteralType type) | 437 explicit LiteralScope(JavaScriptScanner* self) |
| 398 : scanner_(self), complete_(false) { | 438 : scanner_(self), complete_(false) { |
| 399 if (scanner_->RecordsLiteral(type)) { | 439 scanner_->StartLiteral(); |
| 400 scanner_->StartLiteral(); | |
| 401 } | |
| 402 } | 440 } |
| 403 ~LiteralScope() { | 441 ~LiteralScope() { |
| 404 if (!complete_) scanner_->DropLiteral(); | 442 if (!complete_) scanner_->DropLiteral(); |
| 405 } | 443 } |
| 406 void Complete() { | 444 void Complete() { |
| 407 scanner_->TerminateLiteral(); | 445 scanner_->TerminateLiteral(); |
| 408 complete_ = true; | 446 complete_ = true; |
| 409 } | 447 } |
| 410 | 448 |
| 411 private: | 449 private: |
| (...skipping 21 matching lines...) Expand all Loading... |
| 433 // Tells whether the buffer contains an identifier (no escapes). | 471 // Tells whether the buffer contains an identifier (no escapes). |
| 434 // Used for checking if a property name is an identifier. | 472 // Used for checking if a property name is an identifier. |
| 435 static bool IsIdentifier(unibrow::CharacterStream* buffer); | 473 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 436 | 474 |
| 437 // Seek forward to the given position. This operation does not | 475 // Seek forward to the given position. This operation does not |
| 438 // work in general, for instance when there are pushed back | 476 // work in general, for instance when there are pushed back |
| 439 // characters, but works for seeking forward until simple delimiter | 477 // characters, but works for seeking forward until simple delimiter |
| 440 // tokens, which is what it is used for. | 478 // tokens, which is what it is used for. |
| 441 void SeekForward(int pos); | 479 void SeekForward(int pos); |
| 442 | 480 |
| 443 // Whether this scanner records the given literal type or not. | |
| 444 bool RecordsLiteral(LiteralType type) { | |
| 445 return (literal_flags_ & type) != 0; | |
| 446 } | |
| 447 | |
| 448 protected: | 481 protected: |
| 449 bool SkipWhiteSpace(); | 482 bool SkipWhiteSpace(); |
| 450 Token::Value SkipSingleLineComment(); | 483 Token::Value SkipSingleLineComment(); |
| 451 Token::Value SkipMultiLineComment(); | 484 Token::Value SkipMultiLineComment(); |
| 452 | 485 |
| 453 // Scans a single JavaScript token. | 486 // Scans a single JavaScript token. |
| 454 void Scan(); | 487 void Scan(); |
| 455 | 488 |
| 456 void ScanDecimalDigits(); | 489 void ScanDecimalDigits(); |
| 457 Token::Value ScanNumber(bool seen_period); | 490 Token::Value ScanNumber(bool seen_period); |
| 458 Token::Value ScanIdentifierOrKeyword(); | 491 Token::Value ScanIdentifierOrKeyword(); |
| 459 Token::Value ScanIdentifierSuffix(LiteralScope* literal); | 492 Token::Value ScanIdentifierSuffix(LiteralScope* literal); |
| 460 | 493 |
| 461 void ScanEscape(); | 494 void ScanEscape(); |
| 462 Token::Value ScanString(); | 495 Token::Value ScanString(); |
| 463 | 496 |
| 464 // Scans a possible HTML comment -- begins with '<!'. | 497 // Scans a possible HTML comment -- begins with '<!'. |
| 465 Token::Value ScanHtmlComment(); | 498 Token::Value ScanHtmlComment(); |
| 466 | 499 |
| 467 // Decodes a unicode escape-sequence which is part of an identifier. | 500 // Decodes a unicode escape-sequence which is part of an identifier. |
| 468 // If the escape sequence cannot be decoded the result is kBadChar. | 501 // If the escape sequence cannot be decoded the result is kBadChar. |
| 469 uc32 ScanIdentifierUnicodeEscape(); | 502 uc32 ScanIdentifierUnicodeEscape(); |
| 470 | 503 |
| 471 int literal_flags_; | |
| 472 bool has_line_terminator_before_next_; | 504 bool has_line_terminator_before_next_; |
| 473 }; | 505 }; |
| 474 | 506 |
| 475 | 507 |
| 476 // ---------------------------------------------------------------------------- | 508 // ---------------------------------------------------------------------------- |
| 477 // Keyword matching state machine. | 509 // Keyword matching state machine. |
| 478 | 510 |
| 479 class KeywordMatcher { | 511 class KeywordMatcher { |
| 480 // Incrementally recognize keywords. | 512 // Incrementally recognize keywords. |
| 481 // | 513 // |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 594 // keyword with the current prefix). | 626 // keyword with the current prefix). |
| 595 const char* keyword_; | 627 const char* keyword_; |
| 596 int counter_; | 628 int counter_; |
| 597 Token::Value keyword_token_; | 629 Token::Value keyword_token_; |
| 598 }; | 630 }; |
| 599 | 631 |
| 600 | 632 |
| 601 } } // namespace v8::internal | 633 } } // namespace v8::internal |
| 602 | 634 |
| 603 #endif // V8_SCANNER_BASE_H_ | 635 #endif // V8_SCANNER_BASE_H_ |
| OLD | NEW |