| OLD | NEW |
| 1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | 134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
| 135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | 135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
| 136 | 136 |
| 137 static bool IsIdentifier(unibrow::CharacterStream* buffer); | 137 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 138 | 138 |
| 139 private: | 139 private: |
| 140 static StaticResource<Utf8Decoder> utf8_decoder_; | 140 static StaticResource<Utf8Decoder> utf8_decoder_; |
| 141 }; | 141 }; |
| 142 | 142 |
| 143 // ---------------------------------------------------------------------------- | 143 // ---------------------------------------------------------------------------- |
| 144 // LiteralCollector - Collector of chars of literals. | 144 // LiteralBuffer - Collector of chars of literals. |
| 145 | 145 |
| 146 class LiteralCollector { | 146 class LiteralBuffer { |
| 147 public: | 147 public: |
| 148 LiteralCollector(); | 148 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { } |
| 149 ~LiteralCollector(); | |
| 150 | 149 |
| 151 inline void AddChar(uc32 c) { | 150 ~LiteralBuffer() { |
| 152 if (recording_) { | 151 if (backing_store_.length() > 0) { |
| 153 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { | 152 backing_store_.Dispose(); |
| 154 buffer_.Add(static_cast<char>(c)); | |
| 155 } else { | |
| 156 AddCharSlow(c); | |
| 157 } | |
| 158 } | 153 } |
| 159 } | 154 } |
| 160 | 155 |
| 161 void StartLiteral() { | 156 inline void AddChar(uc16 character) { |
| 162 buffer_.StartSequence(); | 157 if (position_ >= backing_store_.length()) ExpandBuffer(); |
| 163 recording_ = true; | 158 if (is_ascii_) { |
| 159 if (character < kMaxAsciiCharCodeU) { |
| 160 backing_store_[position_] = static_cast<byte>(character); |
| 161 position_ += kASCIISize; |
| 162 return; |
| 163 } |
| 164 ConvertToUC16(); |
| 165 } |
| 166 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; |
| 167 position_ += kUC16Size; |
| 164 } | 168 } |
| 165 | 169 |
| 166 Vector<const char> EndLiteral() { | 170 bool is_ascii() { return is_ascii_; } |
| 167 if (recording_) { | 171 |
| 168 recording_ = false; | 172 Vector<const uc16> uc16_literal() { |
| 169 buffer_.Add(kEndMarker); | 173 ASSERT(!is_ascii_); |
| 170 Vector<char> sequence = buffer_.EndSequence(); | 174 ASSERT((position_ & 0x1) == 0); |
| 171 return Vector<const char>(sequence.start(), sequence.length()); | 175 return Vector<const uc16>( |
| 172 } | 176 reinterpret_cast<const uc16*>(backing_store_.start()), |
| 173 return Vector<const char>(); | 177 position_ >> 1); |
| 174 } | 178 } |
| 175 | 179 |
| 176 void DropLiteral() { | 180 Vector<const char> ascii_literal() { |
| 177 if (recording_) { | 181 ASSERT(is_ascii_); |
| 178 recording_ = false; | 182 return Vector<const char>( |
| 179 buffer_.DropSequence(); | 183 reinterpret_cast<const char*>(backing_store_.start()), |
| 180 } | 184 position_); |
| 185 } |
| 186 |
| 187 int length() { |
| 188 return is_ascii_ ? position_ : (position_ >> 1); |
| 181 } | 189 } |
| 182 | 190 |
| 183 void Reset() { | 191 void Reset() { |
| 184 buffer_.Reset(); | 192 position_ = 0; |
| 193 is_ascii_ = true; |
| 194 } |
| 195 private: |
| 196 static const int kInitialCapacity = 16; |
| 197 static const int kGrowthFactory = 4; |
| 198 static const int kMinConversionSlack = 256; |
| 199 static const int kMaxGrowth = 1 * MB; |
| 200 inline int NewCapacity(int min_capacity) { |
| 201 int capacity = Max(min_capacity, backing_store_.length()); |
| 202 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); |
| 203 return new_capacity; |
| 185 } | 204 } |
| 186 | 205 |
| 187 // The end marker added after a parsed literal. | 206 void ExpandBuffer() { |
| 188 // Using zero allows the usage of strlen and similar functions on | 207 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); |
| 189 // identifiers and numbers (but not strings, since they may contain zero | 208 memcpy(new_store.start(), backing_store_.start(), position_); |
| 190 // bytes). | 209 backing_store_.Dispose(); |
| 191 static const char kEndMarker = '\x00'; | 210 backing_store_ = new_store; |
| 192 private: | 211 } |
| 193 static const int kInitialCapacity = 256; | 212 |
| 194 SequenceCollector<char, 4> buffer_; | 213 void ConvertToUC16() { |
| 195 bool recording_; | 214 ASSERT(is_ascii_); |
| 196 void AddCharSlow(uc32 c); | 215 Vector<byte> new_store; |
| 216 int new_content_size = position_ * kUC16Size; |
| 217 if (new_content_size > backing_store_.length()) { |
| 218 new_store = Vector<byte>::New(NewCapacity(new_content_size)); |
| 219 } else { |
| 220 new_store = backing_store_; |
| 221 } |
| 222 char* src = reinterpret_cast<char*>(backing_store_.start()); |
| 223 uc16* dst = reinterpret_cast<uc16*>(new_store.start()); |
| 224 for (int i = position_ - 1; i >= 0; i--) { |
| 225 dst[i] = src[i]; |
| 226 } |
| 227 if (new_store.start() != backing_store_.start()) { |
| 228 backing_store_.Dispose(); |
| 229 backing_store_ = new_store; |
| 230 } |
| 231 position_ = new_content_size; |
| 232 is_ascii_ = false; |
| 233 } |
| 234 |
| 235 bool is_ascii_; |
| 236 int position_; |
| 237 Vector<byte> backing_store_; |
| 197 }; | 238 }; |
| 198 | 239 |
| 240 |
| 199 // ---------------------------------------------------------------------------- | 241 // ---------------------------------------------------------------------------- |
| 200 // Scanner base-class. | 242 // Scanner base-class. |
| 201 | 243 |
| 202 // Generic functionality used by both JSON and JavaScript scanners. | 244 // Generic functionality used by both JSON and JavaScript scanners. |
| 203 class Scanner { | 245 class Scanner { |
| 204 public: | 246 public: |
| 205 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 247 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
| 206 | 248 |
| 207 class LiteralScope { | 249 class LiteralScope { |
| 208 public: | 250 public: |
| (...skipping 25 matching lines...) Expand all Loading... |
| 234 // (the token returned by Next()). | 276 // (the token returned by Next()). |
| 235 Location location() const { return current_.location; } | 277 Location location() const { return current_.location; } |
| 236 Location peek_location() const { return next_.location; } | 278 Location peek_location() const { return next_.location; } |
| 237 | 279 |
| 238 // Returns the literal string, if any, for the current token (the | 280 // Returns the literal string, if any, for the current token (the |
| 239 // token returned by Next()). The string is 0-terminated and in | 281 // token returned by Next()). The string is 0-terminated and in |
| 240 // UTF-8 format; they may contain 0-characters. Literal strings are | 282 // UTF-8 format; they may contain 0-characters. Literal strings are |
| 241 // collected for identifiers, strings, and numbers. | 283 // collected for identifiers, strings, and numbers. |
| 242 // These functions only give the correct result if the literal | 284 // These functions only give the correct result if the literal |
| 243 // was scanned between calls to StartLiteral() and TerminateLiteral(). | 285 // was scanned between calls to StartLiteral() and TerminateLiteral(). |
| 244 const char* literal_string() const { | 286 bool is_literal_ascii() { |
| 245 return current_.literal_chars.start(); | 287 ASSERT_NOT_NULL(current_.literal_chars); |
| 288 return current_.literal_chars->is_ascii(); |
| 246 } | 289 } |
| 247 | 290 Vector<const char> literal_ascii_string() { |
| 291 ASSERT_NOT_NULL(current_.literal_chars); |
| 292 return current_.literal_chars->ascii_literal(); |
| 293 } |
| 294 Vector<const uc16> literal_uc16_string() { |
| 295 ASSERT_NOT_NULL(current_.literal_chars); |
| 296 return current_.literal_chars->uc16_literal(); |
| 297 } |
| 248 int literal_length() const { | 298 int literal_length() const { |
| 249 // Excluding terminal '\x00' added by TerminateLiteral(). | 299 ASSERT_NOT_NULL(current_.literal_chars); |
| 250 return current_.literal_chars.length() - 1; | 300 return current_.literal_chars->length(); |
| 251 } | |
| 252 | |
| 253 Vector<const char> literal() const { | |
| 254 return Vector<const char>(literal_string(), literal_length()); | |
| 255 } | 301 } |
| 256 | 302 |
| 257 // Returns the literal string for the next token (the token that | 303 // Returns the literal string for the next token (the token that |
| 258 // would be returned if Next() were called). | 304 // would be returned if Next() were called). |
| 259 const char* next_literal_string() const { | 305 bool is_next_literal_ascii() { |
| 260 return next_.literal_chars.start(); | 306 ASSERT_NOT_NULL(next_.literal_chars); |
| 307 return next_.literal_chars->is_ascii(); |
| 261 } | 308 } |
| 262 | 309 Vector<const char> next_literal_ascii_string() { |
| 263 | 310 ASSERT_NOT_NULL(next_.literal_chars); |
| 264 // Returns the length of the next token (that would be returned if | 311 return next_.literal_chars->ascii_literal(); |
| 265 // Next() were called). | 312 } |
| 313 Vector<const uc16> next_literal_uc16_string() { |
| 314 ASSERT_NOT_NULL(next_.literal_chars); |
| 315 return next_.literal_chars->uc16_literal(); |
| 316 } |
| 266 int next_literal_length() const { | 317 int next_literal_length() const { |
| 267 // Excluding terminal '\x00' added by TerminateLiteral(). | 318 ASSERT_NOT_NULL(next_.literal_chars); |
| 268 return next_.literal_chars.length() - 1; | 319 return next_.literal_chars->length(); |
| 269 } | |
| 270 | |
| 271 Vector<const char> next_literal() const { | |
| 272 return Vector<const char>(next_literal_string(), next_literal_length()); | |
| 273 } | 320 } |
| 274 | 321 |
| 275 static const int kCharacterLookaheadBufferSize = 1; | 322 static const int kCharacterLookaheadBufferSize = 1; |
| 276 | 323 |
| 277 protected: | 324 protected: |
| 278 // The current and look-ahead token. | 325 // The current and look-ahead token. |
| 279 struct TokenDesc { | 326 struct TokenDesc { |
| 280 Token::Value token; | 327 Token::Value token; |
| 281 Location location; | 328 Location location; |
| 282 Vector<const char> literal_chars; | 329 LiteralBuffer* literal_chars; |
| 283 }; | 330 }; |
| 284 | 331 |
| 285 // Call this after setting source_ to the input. | 332 // Call this after setting source_ to the input. |
| 286 void Init() { | 333 void Init() { |
| 287 // Set c0_ (one character ahead) | 334 // Set c0_ (one character ahead) |
| 288 ASSERT(kCharacterLookaheadBufferSize == 1); | 335 ASSERT(kCharacterLookaheadBufferSize == 1); |
| 289 Advance(); | 336 Advance(); |
| 290 // Initialize current_ to not refer to a literal. | 337 // Initialize current_ to not refer to a literal. |
| 291 current_.literal_chars = Vector<const char>(); | 338 current_.literal_chars = NULL; |
| 292 // Reset literal buffer. | |
| 293 literal_buffer_.Reset(); | |
| 294 } | 339 } |
| 295 | 340 |
| 296 // Literal buffer support | 341 // Literal buffer support |
| 297 inline void StartLiteral() { | 342 inline void StartLiteral() { |
| 298 literal_buffer_.StartLiteral(); | 343 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ? |
| 344 &literal_buffer2_ : &literal_buffer1_; |
| 345 free_buffer->Reset(); |
| 346 next_.literal_chars = free_buffer; |
| 299 } | 347 } |
| 300 | 348 |
| 301 inline void AddLiteralChar(uc32 c) { | 349 inline void AddLiteralChar(uc32 c) { |
| 302 literal_buffer_.AddChar(c); | 350 ASSERT_NOT_NULL(next_.literal_chars); |
| 351 next_.literal_chars->AddChar(c); |
| 303 } | 352 } |
| 304 | 353 |
| 305 // Complete scanning of a literal. | 354 // Complete scanning of a literal. |
| 306 inline void TerminateLiteral() { | 355 inline void TerminateLiteral() { |
| 307 next_.literal_chars = literal_buffer_.EndLiteral(); | 356 // Does nothing in the current implementation. |
| 308 } | 357 } |
| 309 | 358 |
| 310 // Stops scanning of a literal and drop the collected characters, | 359 // Stops scanning of a literal and drop the collected characters, |
| 311 // e.g., due to an encountered error. | 360 // e.g., due to an encountered error. |
| 312 inline void DropLiteral() { | 361 inline void DropLiteral() { |
| 313 literal_buffer_.DropLiteral(); | 362 next_.literal_chars = NULL; |
| 314 } | 363 } |
| 315 | 364 |
| 316 inline void AddLiteralCharAdvance() { | 365 inline void AddLiteralCharAdvance() { |
| 317 AddLiteralChar(c0_); | 366 AddLiteralChar(c0_); |
| 318 Advance(); | 367 Advance(); |
| 319 } | 368 } |
| 320 | 369 |
| 321 // Low-level scanning support. | 370 // Low-level scanning support. |
| 322 void Advance() { c0_ = source_->Advance(); } | 371 void Advance() { c0_ = source_->Advance(); } |
| 323 void PushBack(uc32 ch) { | 372 void PushBack(uc32 ch) { |
| (...skipping 17 matching lines...) Expand all Loading... |
| 341 } | 390 } |
| 342 | 391 |
| 343 uc32 ScanHexEscape(uc32 c, int length); | 392 uc32 ScanHexEscape(uc32 c, int length); |
| 344 uc32 ScanOctalEscape(uc32 c, int length); | 393 uc32 ScanOctalEscape(uc32 c, int length); |
| 345 | 394 |
| 346 // Return the current source position. | 395 // Return the current source position. |
| 347 int source_pos() { | 396 int source_pos() { |
| 348 return source_->pos() - kCharacterLookaheadBufferSize; | 397 return source_->pos() - kCharacterLookaheadBufferSize; |
| 349 } | 398 } |
| 350 | 399 |
| 400 // Buffers collecting literal strings, numbers, etc. |
| 401 LiteralBuffer literal_buffer1_; |
| 402 LiteralBuffer literal_buffer2_; |
| 403 |
| 351 TokenDesc current_; // desc for current token (as returned by Next()) | 404 TokenDesc current_; // desc for current token (as returned by Next()) |
| 352 TokenDesc next_; // desc for next token (one token look-ahead) | 405 TokenDesc next_; // desc for next token (one token look-ahead) |
| 353 | 406 |
| 354 // Input stream. Must be initialized to an UC16CharacterStream. | 407 // Input stream. Must be initialized to an UC16CharacterStream. |
| 355 UC16CharacterStream* source_; | 408 UC16CharacterStream* source_; |
| 356 | 409 |
| 357 // Buffer to hold literal values (identifiers, strings, numbers) | |
| 358 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. | |
| 359 LiteralCollector literal_buffer_; | |
| 360 | 410 |
| 361 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | 411 // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 362 uc32 c0_; | 412 uc32 c0_; |
| 363 }; | 413 }; |
| 364 | 414 |
| 365 // ---------------------------------------------------------------------------- | 415 // ---------------------------------------------------------------------------- |
| 366 // JavaScriptScanner - base logic for JavaScript scanning. | 416 // JavaScriptScanner - base logic for JavaScript scanning. |
| 367 | 417 |
| 368 class JavaScriptScanner : public Scanner { | 418 class JavaScriptScanner : public Scanner { |
| 369 public: | 419 public: |
| 370 | |
| 371 // Bit vector representing set of types of literals. | |
| 372 enum LiteralType { | |
| 373 kNoLiterals = 0, | |
| 374 kLiteralNumber = 1, | |
| 375 kLiteralIdentifier = 2, | |
| 376 kLiteralString = 4, | |
| 377 kLiteralRegExp = 8, | |
| 378 kLiteralRegExpFlags = 16, | |
| 379 kAllLiterals = 31 | |
| 380 }; | |
| 381 | |
| 382 // A LiteralScope that disables recording of some types of JavaScript | 420 // A LiteralScope that disables recording of some types of JavaScript |
| 383 // literals. If the scanner is configured to not record the specific | 421 // literals. If the scanner is configured to not record the specific |
| 384 // type of literal, the scope will not call StartLiteral. | 422 // type of literal, the scope will not call StartLiteral. |
| 385 class LiteralScope { | 423 class LiteralScope { |
| 386 public: | 424 public: |
| 387 LiteralScope(JavaScriptScanner* self, LiteralType type) | 425 explicit LiteralScope(JavaScriptScanner* self) |
| 388 : scanner_(self), complete_(false) { | 426 : scanner_(self), complete_(false) { |
| 389 if (scanner_->RecordsLiteral(type)) { | 427 scanner_->StartLiteral(); |
| 390 scanner_->StartLiteral(); | |
| 391 } | |
| 392 } | 428 } |
| 393 ~LiteralScope() { | 429 ~LiteralScope() { |
| 394 if (!complete_) scanner_->DropLiteral(); | 430 if (!complete_) scanner_->DropLiteral(); |
| 395 } | 431 } |
| 396 void Complete() { | 432 void Complete() { |
| 397 scanner_->TerminateLiteral(); | 433 scanner_->TerminateLiteral(); |
| 398 complete_ = true; | 434 complete_ = true; |
| 399 } | 435 } |
| 400 | 436 |
| 401 private: | 437 private: |
| (...skipping 21 matching lines...) Expand all Loading... |
| 423 // Tells whether the buffer contains an identifier (no escapes). | 459 // Tells whether the buffer contains an identifier (no escapes). |
| 424 // Used for checking if a property name is an identifier. | 460 // Used for checking if a property name is an identifier. |
| 425 static bool IsIdentifier(unibrow::CharacterStream* buffer); | 461 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 426 | 462 |
| 427 // Seek forward to the given position. This operation does not | 463 // Seek forward to the given position. This operation does not |
| 428 // work in general, for instance when there are pushed back | 464 // work in general, for instance when there are pushed back |
| 429 // characters, but works for seeking forward until simple delimiter | 465 // characters, but works for seeking forward until simple delimiter |
| 430 // tokens, which is what it is used for. | 466 // tokens, which is what it is used for. |
| 431 void SeekForward(int pos); | 467 void SeekForward(int pos); |
| 432 | 468 |
| 433 // Whether this scanner records the given literal type or not. | |
| 434 bool RecordsLiteral(LiteralType type) { | |
| 435 return (literal_flags_ & type) != 0; | |
| 436 } | |
| 437 | |
| 438 protected: | 469 protected: |
| 439 bool SkipWhiteSpace(); | 470 bool SkipWhiteSpace(); |
| 440 Token::Value SkipSingleLineComment(); | 471 Token::Value SkipSingleLineComment(); |
| 441 Token::Value SkipMultiLineComment(); | 472 Token::Value SkipMultiLineComment(); |
| 442 | 473 |
| 443 // Scans a single JavaScript token. | 474 // Scans a single JavaScript token. |
| 444 void Scan(); | 475 void Scan(); |
| 445 | 476 |
| 446 void ScanDecimalDigits(); | 477 void ScanDecimalDigits(); |
| 447 Token::Value ScanNumber(bool seen_period); | 478 Token::Value ScanNumber(bool seen_period); |
| 448 Token::Value ScanIdentifierOrKeyword(); | 479 Token::Value ScanIdentifierOrKeyword(); |
| 449 Token::Value ScanIdentifierSuffix(LiteralScope* literal); | 480 Token::Value ScanIdentifierSuffix(LiteralScope* literal); |
| 450 | 481 |
| 451 void ScanEscape(); | 482 void ScanEscape(); |
| 452 Token::Value ScanString(); | 483 Token::Value ScanString(); |
| 453 | 484 |
| 454 // Scans a possible HTML comment -- begins with '<!'. | 485 // Scans a possible HTML comment -- begins with '<!'. |
| 455 Token::Value ScanHtmlComment(); | 486 Token::Value ScanHtmlComment(); |
| 456 | 487 |
| 457 // Decodes a unicode escape-sequence which is part of an identifier. | 488 // Decodes a unicode escape-sequence which is part of an identifier. |
| 458 // If the escape sequence cannot be decoded the result is kBadChar. | 489 // If the escape sequence cannot be decoded the result is kBadChar. |
| 459 uc32 ScanIdentifierUnicodeEscape(); | 490 uc32 ScanIdentifierUnicodeEscape(); |
| 460 | 491 |
| 461 int literal_flags_; | |
| 462 bool has_line_terminator_before_next_; | 492 bool has_line_terminator_before_next_; |
| 463 }; | 493 }; |
| 464 | 494 |
| 465 | 495 |
| 466 // ---------------------------------------------------------------------------- | 496 // ---------------------------------------------------------------------------- |
| 467 // Keyword matching state machine. | 497 // Keyword matching state machine. |
| 468 | 498 |
| 469 class KeywordMatcher { | 499 class KeywordMatcher { |
| 470 // Incrementally recognize keywords. | 500 // Incrementally recognize keywords. |
| 471 // | 501 // |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 584 // keyword with the current prefix). | 614 // keyword with the current prefix). |
| 585 const char* keyword_; | 615 const char* keyword_; |
| 586 int counter_; | 616 int counter_; |
| 587 Token::Value keyword_token_; | 617 Token::Value keyword_token_; |
| 588 }; | 618 }; |
| 589 | 619 |
| 590 | 620 |
| 591 } } // namespace v8::internal | 621 } } // namespace v8::internal |
| 592 | 622 |
| 593 #endif // V8_SCANNER_BASE_H_ | 623 #endif // V8_SCANNER_BASE_H_ |
| OLD | NEW |