OLD | NEW |
1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | 134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | 135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
136 | 136 |
137 static bool IsIdentifier(unibrow::CharacterStream* buffer); | 137 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
138 | 138 |
139 private: | 139 private: |
140 static StaticResource<Utf8Decoder> utf8_decoder_; | 140 static StaticResource<Utf8Decoder> utf8_decoder_; |
141 }; | 141 }; |
142 | 142 |
143 // ---------------------------------------------------------------------------- | 143 // ---------------------------------------------------------------------------- |
144 // LiteralCollector - Collector of chars of literals. | 144 // LiteralBuffer - Collector of chars of literals. |
145 | 145 |
146 class LiteralCollector { | 146 class LiteralBuffer { |
147 public: | 147 public: |
148 LiteralCollector(); | 148 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { } |
149 ~LiteralCollector(); | |
150 | 149 |
151 inline void AddChar(uc32 c) { | 150 ~LiteralBuffer() { |
152 if (recording_) { | 151 if (backing_store_.length() > 0) { |
153 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { | 152 backing_store_.Dispose(); |
154 buffer_.Add(static_cast<char>(c)); | |
155 } else { | |
156 AddCharSlow(c); | |
157 } | |
158 } | 153 } |
159 } | 154 } |
160 | 155 |
161 void StartLiteral() { | 156 inline void AddChar(uc16 character) { |
162 buffer_.StartSequence(); | 157 if (position_ >= backing_store_.length()) ExpandBuffer(); |
163 recording_ = true; | 158 if (is_ascii_) { |
| 159 if (character < kMaxAsciiCharCodeU) { |
| 160 backing_store_[position_] = static_cast<byte>(character); |
| 161 position_ += kASCIISize; |
| 162 return; |
| 163 } |
| 164 ConvertToUC16(); |
| 165 } |
| 166 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; |
| 167 position_ += kUC16Size; |
164 } | 168 } |
165 | 169 |
166 Vector<const char> EndLiteral() { | 170 bool is_ascii() { return is_ascii_; } |
167 if (recording_) { | 171 |
168 recording_ = false; | 172 Vector<const uc16> uc16_literal() { |
169 buffer_.Add(kEndMarker); | 173 ASSERT(!is_ascii_); |
170 Vector<char> sequence = buffer_.EndSequence(); | 174 ASSERT((position_ & 0x1) == 0); |
171 return Vector<const char>(sequence.start(), sequence.length()); | 175 return Vector<const uc16>( |
172 } | 176 reinterpret_cast<const uc16*>(backing_store_.start()), |
173 return Vector<const char>(); | 177 position_ >> 1); |
174 } | 178 } |
175 | 179 |
176 void DropLiteral() { | 180 Vector<const char> ascii_literal() { |
177 if (recording_) { | 181 ASSERT(is_ascii_); |
178 recording_ = false; | 182 return Vector<const char>( |
179 buffer_.DropSequence(); | 183 reinterpret_cast<const char*>(backing_store_.start()), |
180 } | 184 position_); |
| 185 } |
| 186 |
| 187 int length() { |
| 188 return is_ascii_ ? position_ : (position_ >> 1); |
181 } | 189 } |
182 | 190 |
183 void Reset() { | 191 void Reset() { |
184 buffer_.Reset(); | 192 position_ = 0; |
| 193 is_ascii_ = true; |
| 194 } |
| 195 private: |
| 196 static const int kInitialCapacity = 16; |
| 197 static const int kGrowthFactory = 4; |
| 198 static const int kMinConversionSlack = 256; |
| 199 static const int kMaxGrowth = 1 * MB; |
| 200 inline int NewCapacity(int min_capacity) { |
| 201 int capacity = Max(min_capacity, backing_store_.length()); |
| 202 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); |
| 203 return new_capacity; |
185 } | 204 } |
186 | 205 |
187 // The end marker added after a parsed literal. | 206 void ExpandBuffer() { |
188 // Using zero allows the usage of strlen and similar functions on | 207 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); |
189 // identifiers and numbers (but not strings, since they may contain zero | 208 memcpy(new_store.start(), backing_store_.start(), position_); |
190 // bytes). | 209 backing_store_.Dispose(); |
191 static const char kEndMarker = '\x00'; | 210 backing_store_ = new_store; |
192 private: | 211 } |
193 static const int kInitialCapacity = 256; | 212 |
194 SequenceCollector<char, 4> buffer_; | 213 void ConvertToUC16() { |
195 bool recording_; | 214 ASSERT(is_ascii_); |
196 void AddCharSlow(uc32 c); | 215 Vector<byte> new_store; |
| 216 int new_content_size = position_ * kUC16Size; |
| 217 if (new_content_size > backing_store_.length()) { |
| 218 new_store = Vector<byte>::New(NewCapacity(new_content_size)); |
| 219 } else { |
| 220 new_store = backing_store_; |
| 221 } |
| 222 char* src = reinterpret_cast<char*>(backing_store_.start()); |
| 223 uc16* dst = reinterpret_cast<uc16*>(new_store.start()); |
| 224 for (int i = position_ - 1; i >= 0; i--) { |
| 225 dst[i] = src[i]; |
| 226 } |
| 227 if (new_store.start() != backing_store_.start()) { |
| 228 backing_store_.Dispose(); |
| 229 backing_store_ = new_store; |
| 230 } |
| 231 position_ = new_content_size; |
| 232 is_ascii_ = false; |
| 233 } |
| 234 |
| 235 bool is_ascii_; |
| 236 int position_; |
| 237 Vector<byte> backing_store_; |
197 }; | 238 }; |
198 | 239 |
| 240 |
199 // ---------------------------------------------------------------------------- | 241 // ---------------------------------------------------------------------------- |
200 // Scanner base-class. | 242 // Scanner base-class. |
201 | 243 |
202 // Generic functionality used by both JSON and JavaScript scanners. | 244 // Generic functionality used by both JSON and JavaScript scanners. |
203 class Scanner { | 245 class Scanner { |
204 public: | 246 public: |
205 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 247 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
206 | 248 |
207 class LiteralScope { | 249 class LiteralScope { |
208 public: | 250 public: |
(...skipping 25 matching lines...) Expand all Loading... |
234 // (the token returned by Next()). | 276 // (the token returned by Next()). |
235 Location location() const { return current_.location; } | 277 Location location() const { return current_.location; } |
236 Location peek_location() const { return next_.location; } | 278 Location peek_location() const { return next_.location; } |
237 | 279 |
238 // Returns the literal string, if any, for the current token (the | 280 // Returns the literal string, if any, for the current token (the |
239 // token returned by Next()). The string is 0-terminated and in | 281 // token returned by Next()). The string is 0-terminated and in |
240 // UTF-8 format; they may contain 0-characters. Literal strings are | 282 // UTF-8 format; they may contain 0-characters. Literal strings are |
241 // collected for identifiers, strings, and numbers. | 283 // collected for identifiers, strings, and numbers. |
242 // These functions only give the correct result if the literal | 284 // These functions only give the correct result if the literal |
243 // was scanned between calls to StartLiteral() and TerminateLiteral(). | 285 // was scanned between calls to StartLiteral() and TerminateLiteral(). |
244 const char* literal_string() const { | 286 bool is_literal_ascii() { |
245 return current_.literal_chars.start(); | 287 ASSERT_NOT_NULL(current_.literal_chars); |
| 288 return current_.literal_chars->is_ascii(); |
246 } | 289 } |
247 | 290 Vector<const char> literal_ascii_string() { |
| 291 ASSERT_NOT_NULL(current_.literal_chars); |
| 292 return current_.literal_chars->ascii_literal(); |
| 293 } |
| 294 Vector<const uc16> literal_uc16_string() { |
| 295 ASSERT_NOT_NULL(current_.literal_chars); |
| 296 return current_.literal_chars->uc16_literal(); |
| 297 } |
248 int literal_length() const { | 298 int literal_length() const { |
249 // Excluding terminal '\x00' added by TerminateLiteral(). | 299 ASSERT_NOT_NULL(current_.literal_chars); |
250 return current_.literal_chars.length() - 1; | 300 return current_.literal_chars->length(); |
251 } | |
252 | |
253 Vector<const char> literal() const { | |
254 return Vector<const char>(literal_string(), literal_length()); | |
255 } | 301 } |
256 | 302 |
257 // Returns the literal string for the next token (the token that | 303 // Returns the literal string for the next token (the token that |
258 // would be returned if Next() were called). | 304 // would be returned if Next() were called). |
259 const char* next_literal_string() const { | 305 bool is_next_literal_ascii() { |
260 return next_.literal_chars.start(); | 306 ASSERT_NOT_NULL(next_.literal_chars); |
| 307 return next_.literal_chars->is_ascii(); |
261 } | 308 } |
262 | 309 Vector<const char> next_literal_ascii_string() { |
263 | 310 ASSERT_NOT_NULL(next_.literal_chars); |
264 // Returns the length of the next token (that would be returned if | 311 return next_.literal_chars->ascii_literal(); |
265 // Next() were called). | 312 } |
| 313 Vector<const uc16> next_literal_uc16_string() { |
| 314 ASSERT_NOT_NULL(next_.literal_chars); |
| 315 return next_.literal_chars->uc16_literal(); |
| 316 } |
266 int next_literal_length() const { | 317 int next_literal_length() const { |
267 // Excluding terminal '\x00' added by TerminateLiteral(). | 318 ASSERT_NOT_NULL(next_.literal_chars); |
268 return next_.literal_chars.length() - 1; | 319 return next_.literal_chars->length(); |
269 } | |
270 | |
271 Vector<const char> next_literal() const { | |
272 return Vector<const char>(next_literal_string(), next_literal_length()); | |
273 } | 320 } |
274 | 321 |
275 static const int kCharacterLookaheadBufferSize = 1; | 322 static const int kCharacterLookaheadBufferSize = 1; |
276 | 323 |
277 protected: | 324 protected: |
278 // The current and look-ahead token. | 325 // The current and look-ahead token. |
279 struct TokenDesc { | 326 struct TokenDesc { |
280 Token::Value token; | 327 Token::Value token; |
281 Location location; | 328 Location location; |
282 Vector<const char> literal_chars; | 329 LiteralBuffer* literal_chars; |
283 }; | 330 }; |
284 | 331 |
285 // Call this after setting source_ to the input. | 332 // Call this after setting source_ to the input. |
286 void Init() { | 333 void Init() { |
287 // Set c0_ (one character ahead) | 334 // Set c0_ (one character ahead) |
288 ASSERT(kCharacterLookaheadBufferSize == 1); | 335 ASSERT(kCharacterLookaheadBufferSize == 1); |
289 Advance(); | 336 Advance(); |
290 // Initialize current_ to not refer to a literal. | 337 // Initialize current_ to not refer to a literal. |
291 current_.literal_chars = Vector<const char>(); | 338 current_.literal_chars = NULL; |
292 // Reset literal buffer. | |
293 literal_buffer_.Reset(); | |
294 } | 339 } |
295 | 340 |
296 // Literal buffer support | 341 // Literal buffer support |
297 inline void StartLiteral() { | 342 inline void StartLiteral() { |
298 literal_buffer_.StartLiteral(); | 343 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ? |
| 344 &literal_buffer2_ : &literal_buffer1_; |
| 345 free_buffer->Reset(); |
| 346 next_.literal_chars = free_buffer; |
299 } | 347 } |
300 | 348 |
301 inline void AddLiteralChar(uc32 c) { | 349 inline void AddLiteralChar(uc32 c) { |
302 literal_buffer_.AddChar(c); | 350 ASSERT_NOT_NULL(next_.literal_chars); |
| 351 next_.literal_chars->AddChar(c); |
303 } | 352 } |
304 | 353 |
305 // Complete scanning of a literal. | 354 // Complete scanning of a literal. |
306 inline void TerminateLiteral() { | 355 inline void TerminateLiteral() { |
307 next_.literal_chars = literal_buffer_.EndLiteral(); | 356 // Does nothing in the current implementation. |
308 } | 357 } |
309 | 358 |
310 // Stops scanning of a literal and drop the collected characters, | 359 // Stops scanning of a literal and drop the collected characters, |
311 // e.g., due to an encountered error. | 360 // e.g., due to an encountered error. |
312 inline void DropLiteral() { | 361 inline void DropLiteral() { |
313 literal_buffer_.DropLiteral(); | 362 next_.literal_chars = NULL; |
314 } | 363 } |
315 | 364 |
316 inline void AddLiteralCharAdvance() { | 365 inline void AddLiteralCharAdvance() { |
317 AddLiteralChar(c0_); | 366 AddLiteralChar(c0_); |
318 Advance(); | 367 Advance(); |
319 } | 368 } |
320 | 369 |
321 // Low-level scanning support. | 370 // Low-level scanning support. |
322 void Advance() { c0_ = source_->Advance(); } | 371 void Advance() { c0_ = source_->Advance(); } |
323 void PushBack(uc32 ch) { | 372 void PushBack(uc32 ch) { |
(...skipping 17 matching lines...) Expand all Loading... |
341 } | 390 } |
342 | 391 |
343 uc32 ScanHexEscape(uc32 c, int length); | 392 uc32 ScanHexEscape(uc32 c, int length); |
344 uc32 ScanOctalEscape(uc32 c, int length); | 393 uc32 ScanOctalEscape(uc32 c, int length); |
345 | 394 |
346 // Return the current source position. | 395 // Return the current source position. |
347 int source_pos() { | 396 int source_pos() { |
348 return source_->pos() - kCharacterLookaheadBufferSize; | 397 return source_->pos() - kCharacterLookaheadBufferSize; |
349 } | 398 } |
350 | 399 |
| 400 // Buffers collecting literal strings, numbers, etc. |
| 401 LiteralBuffer literal_buffer1_; |
| 402 LiteralBuffer literal_buffer2_; |
| 403 |
351 TokenDesc current_; // desc for current token (as returned by Next()) | 404 TokenDesc current_; // desc for current token (as returned by Next()) |
352 TokenDesc next_; // desc for next token (one token look-ahead) | 405 TokenDesc next_; // desc for next token (one token look-ahead) |
353 | 406 |
354 // Input stream. Must be initialized to an UC16CharacterStream. | 407 // Input stream. Must be initialized to an UC16CharacterStream. |
355 UC16CharacterStream* source_; | 408 UC16CharacterStream* source_; |
356 | 409 |
357 // Buffer to hold literal values (identifiers, strings, numbers) | |
358 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. | |
359 LiteralCollector literal_buffer_; | |
360 | 410 |
361 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | 411 // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
362 uc32 c0_; | 412 uc32 c0_; |
363 }; | 413 }; |
364 | 414 |
365 // ---------------------------------------------------------------------------- | 415 // ---------------------------------------------------------------------------- |
366 // JavaScriptScanner - base logic for JavaScript scanning. | 416 // JavaScriptScanner - base logic for JavaScript scanning. |
367 | 417 |
368 class JavaScriptScanner : public Scanner { | 418 class JavaScriptScanner : public Scanner { |
369 public: | 419 public: |
370 | |
371 // Bit vector representing set of types of literals. | |
372 enum LiteralType { | |
373 kNoLiterals = 0, | |
374 kLiteralNumber = 1, | |
375 kLiteralIdentifier = 2, | |
376 kLiteralString = 4, | |
377 kLiteralRegExp = 8, | |
378 kLiteralRegExpFlags = 16, | |
379 kAllLiterals = 31 | |
380 }; | |
381 | |
382 // A LiteralScope that disables recording of some types of JavaScript | 420 // A LiteralScope that disables recording of some types of JavaScript |
383 // literals. If the scanner is configured to not record the specific | 421 // literals. If the scanner is configured to not record the specific |
384 // type of literal, the scope will not call StartLiteral. | 422 // type of literal, the scope will not call StartLiteral. |
385 class LiteralScope { | 423 class LiteralScope { |
386 public: | 424 public: |
387 LiteralScope(JavaScriptScanner* self, LiteralType type) | 425 explicit LiteralScope(JavaScriptScanner* self) |
388 : scanner_(self), complete_(false) { | 426 : scanner_(self), complete_(false) { |
389 if (scanner_->RecordsLiteral(type)) { | 427 scanner_->StartLiteral(); |
390 scanner_->StartLiteral(); | |
391 } | |
392 } | 428 } |
393 ~LiteralScope() { | 429 ~LiteralScope() { |
394 if (!complete_) scanner_->DropLiteral(); | 430 if (!complete_) scanner_->DropLiteral(); |
395 } | 431 } |
396 void Complete() { | 432 void Complete() { |
397 scanner_->TerminateLiteral(); | 433 scanner_->TerminateLiteral(); |
398 complete_ = true; | 434 complete_ = true; |
399 } | 435 } |
400 | 436 |
401 private: | 437 private: |
(...skipping 21 matching lines...) Expand all Loading... |
423 // Tells whether the buffer contains an identifier (no escapes). | 459 // Tells whether the buffer contains an identifier (no escapes). |
424 // Used for checking if a property name is an identifier. | 460 // Used for checking if a property name is an identifier. |
425 static bool IsIdentifier(unibrow::CharacterStream* buffer); | 461 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
426 | 462 |
427 // Seek forward to the given position. This operation does not | 463 // Seek forward to the given position. This operation does not |
428 // work in general, for instance when there are pushed back | 464 // work in general, for instance when there are pushed back |
429 // characters, but works for seeking forward until simple delimiter | 465 // characters, but works for seeking forward until simple delimiter |
430 // tokens, which is what it is used for. | 466 // tokens, which is what it is used for. |
431 void SeekForward(int pos); | 467 void SeekForward(int pos); |
432 | 468 |
433 // Whether this scanner records the given literal type or not. | |
434 bool RecordsLiteral(LiteralType type) { | |
435 return (literal_flags_ & type) != 0; | |
436 } | |
437 | |
438 protected: | 469 protected: |
439 bool SkipWhiteSpace(); | 470 bool SkipWhiteSpace(); |
440 Token::Value SkipSingleLineComment(); | 471 Token::Value SkipSingleLineComment(); |
441 Token::Value SkipMultiLineComment(); | 472 Token::Value SkipMultiLineComment(); |
442 | 473 |
443 // Scans a single JavaScript token. | 474 // Scans a single JavaScript token. |
444 void Scan(); | 475 void Scan(); |
445 | 476 |
446 void ScanDecimalDigits(); | 477 void ScanDecimalDigits(); |
447 Token::Value ScanNumber(bool seen_period); | 478 Token::Value ScanNumber(bool seen_period); |
448 Token::Value ScanIdentifierOrKeyword(); | 479 Token::Value ScanIdentifierOrKeyword(); |
449 Token::Value ScanIdentifierSuffix(LiteralScope* literal); | 480 Token::Value ScanIdentifierSuffix(LiteralScope* literal); |
450 | 481 |
451 void ScanEscape(); | 482 void ScanEscape(); |
452 Token::Value ScanString(); | 483 Token::Value ScanString(); |
453 | 484 |
454 // Scans a possible HTML comment -- begins with '<!'. | 485 // Scans a possible HTML comment -- begins with '<!'. |
455 Token::Value ScanHtmlComment(); | 486 Token::Value ScanHtmlComment(); |
456 | 487 |
457 // Decodes a unicode escape-sequence which is part of an identifier. | 488 // Decodes a unicode escape-sequence which is part of an identifier. |
458 // If the escape sequence cannot be decoded the result is kBadChar. | 489 // If the escape sequence cannot be decoded the result is kBadChar. |
459 uc32 ScanIdentifierUnicodeEscape(); | 490 uc32 ScanIdentifierUnicodeEscape(); |
460 | 491 |
461 int literal_flags_; | |
462 bool has_line_terminator_before_next_; | 492 bool has_line_terminator_before_next_; |
463 }; | 493 }; |
464 | 494 |
465 | 495 |
466 // ---------------------------------------------------------------------------- | 496 // ---------------------------------------------------------------------------- |
467 // Keyword matching state machine. | 497 // Keyword matching state machine. |
468 | 498 |
469 class KeywordMatcher { | 499 class KeywordMatcher { |
470 // Incrementally recognize keywords. | 500 // Incrementally recognize keywords. |
471 // | 501 // |
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
584 // keyword with the current prefix). | 614 // keyword with the current prefix). |
585 const char* keyword_; | 615 const char* keyword_; |
586 int counter_; | 616 int counter_; |
587 Token::Value keyword_token_; | 617 Token::Value keyword_token_; |
588 }; | 618 }; |
589 | 619 |
590 | 620 |
591 } } // namespace v8::internal | 621 } } // namespace v8::internal |
592 | 622 |
593 #endif // V8_SCANNER_BASE_H_ | 623 #endif // V8_SCANNER_BASE_H_ |
OLD | NEW |