OLD | NEW |
| (Empty) |
1 // Copyright 2011 the V8 project authors. All rights reserved. | |
2 // Redistribution and use in source and binary forms, with or without | |
3 // modification, are permitted provided that the following conditions are | |
4 // met: | |
5 // | |
6 // * Redistributions of source code must retain the above copyright | |
7 // notice, this list of conditions and the following disclaimer. | |
8 // * Redistributions in binary form must reproduce the above | |
9 // copyright notice, this list of conditions and the following | |
10 // disclaimer in the documentation and/or other materials provided | |
11 // with the distribution. | |
12 // * Neither the name of Google Inc. nor the names of its | |
13 // contributors may be used to endorse or promote products derived | |
14 // from this software without specific prior written permission. | |
15 // | |
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
27 | |
28 // Features shared by parsing and pre-parsing scanners. | |
29 | |
30 #ifndef V8_SCANNER_BASE_H_ | |
31 #define V8_SCANNER_BASE_H_ | |
32 | |
33 #include "allocation.h" | |
34 #include "char-predicates.h" | |
35 #include "checks.h" | |
36 #include "globals.h" | |
37 #include "token.h" | |
38 #include "unicode-inl.h" | |
39 #include "utils.h" | |
40 | |
41 namespace v8 { | |
42 namespace internal { | |
43 | |
44 // Returns the value (0 .. 15) of a hexadecimal character c. | |
45 // If c is not a legal hexadecimal character, returns a value < 0. | |
46 inline int HexValue(uc32 c) { | |
47 c -= '0'; | |
48 if (static_cast<unsigned>(c) <= 9) return c; | |
49 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. | |
50 if (static_cast<unsigned>(c) <= 5) return c + 10; | |
51 return -1; | |
52 } | |
53 | |
54 | |
55 // --------------------------------------------------------------------- | |
56 // Buffered stream of characters, using an internal UC16 buffer. | |
57 | |
58 class UC16CharacterStream { | |
59 public: | |
60 UC16CharacterStream() : pos_(0) { } | |
61 virtual ~UC16CharacterStream() { } | |
62 | |
63 // Returns and advances past the next UC16 character in the input | |
64 // stream. If there are no more characters, it returns a negative | |
65 // value. | |
66 inline uc32 Advance() { | |
67 if (buffer_cursor_ < buffer_end_ || ReadBlock()) { | |
68 pos_++; | |
69 return static_cast<uc32>(*(buffer_cursor_++)); | |
70 } | |
71 // Note: currently the following increment is necessary to avoid a | |
72 // parser problem! The scanner treats the final kEndOfInput as | |
73 // a character with a position, and does math relative to that | |
74 // position. | |
75 pos_++; | |
76 | |
77 return kEndOfInput; | |
78 } | |
79 | |
80 // Return the current position in the character stream. | |
81 // Starts at zero. | |
82 inline unsigned pos() const { return pos_; } | |
83 | |
84 // Skips forward past the next character_count UC16 characters | |
85 // in the input, or until the end of input if that comes sooner. | |
86 // Returns the number of characters actually skipped. If less | |
87 // than character_count, | |
88 inline unsigned SeekForward(unsigned character_count) { | |
89 unsigned buffered_chars = | |
90 static_cast<unsigned>(buffer_end_ - buffer_cursor_); | |
91 if (character_count <= buffered_chars) { | |
92 buffer_cursor_ += character_count; | |
93 pos_ += character_count; | |
94 return character_count; | |
95 } | |
96 return SlowSeekForward(character_count); | |
97 } | |
98 | |
99 // Pushes back the most recently read UC16 character (or negative | |
100 // value if at end of input), i.e., the value returned by the most recent | |
101 // call to Advance. | |
102 // Must not be used right after calling SeekForward. | |
103 virtual void PushBack(int32_t character) = 0; | |
104 | |
105 protected: | |
106 static const uc32 kEndOfInput = -1; | |
107 | |
108 // Ensures that the buffer_cursor_ points to the character at | |
109 // position pos_ of the input, if possible. If the position | |
110 // is at or after the end of the input, return false. If there | |
111 // are more characters available, return true. | |
112 virtual bool ReadBlock() = 0; | |
113 virtual unsigned SlowSeekForward(unsigned character_count) = 0; | |
114 | |
115 const uc16* buffer_cursor_; | |
116 const uc16* buffer_end_; | |
117 unsigned pos_; | |
118 }; | |
119 | |
120 | |
121 class UnicodeCache { | |
122 // --------------------------------------------------------------------- | |
123 // Caching predicates used by scanners. | |
124 public: | |
125 UnicodeCache() {} | |
126 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | |
127 | |
128 StaticResource<Utf8Decoder>* utf8_decoder() { | |
129 return &utf8_decoder_; | |
130 } | |
131 | |
132 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); } | |
133 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); } | |
134 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); } | |
135 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); } | |
136 | |
137 private: | |
138 | |
139 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; | |
140 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; | |
141 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | |
142 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | |
143 StaticResource<Utf8Decoder> utf8_decoder_; | |
144 | |
145 DISALLOW_COPY_AND_ASSIGN(UnicodeCache); | |
146 }; | |
147 | |
148 | |
149 // ---------------------------------------------------------------------------- | |
150 // LiteralBuffer - Collector of chars of literals. | |
151 | |
152 class LiteralBuffer { | |
153 public: | |
154 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { } | |
155 | |
156 ~LiteralBuffer() { | |
157 if (backing_store_.length() > 0) { | |
158 backing_store_.Dispose(); | |
159 } | |
160 } | |
161 | |
162 inline void AddChar(uc16 character) { | |
163 if (position_ >= backing_store_.length()) ExpandBuffer(); | |
164 if (is_ascii_) { | |
165 if (character < kMaxAsciiCharCodeU) { | |
166 backing_store_[position_] = static_cast<byte>(character); | |
167 position_ += kASCIISize; | |
168 return; | |
169 } | |
170 ConvertToUC16(); | |
171 } | |
172 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; | |
173 position_ += kUC16Size; | |
174 } | |
175 | |
176 bool is_ascii() { return is_ascii_; } | |
177 | |
178 Vector<const uc16> uc16_literal() { | |
179 ASSERT(!is_ascii_); | |
180 ASSERT((position_ & 0x1) == 0); | |
181 return Vector<const uc16>( | |
182 reinterpret_cast<const uc16*>(backing_store_.start()), | |
183 position_ >> 1); | |
184 } | |
185 | |
186 Vector<const char> ascii_literal() { | |
187 ASSERT(is_ascii_); | |
188 return Vector<const char>( | |
189 reinterpret_cast<const char*>(backing_store_.start()), | |
190 position_); | |
191 } | |
192 | |
193 int length() { | |
194 return is_ascii_ ? position_ : (position_ >> 1); | |
195 } | |
196 | |
197 void Reset() { | |
198 position_ = 0; | |
199 is_ascii_ = true; | |
200 } | |
201 private: | |
202 static const int kInitialCapacity = 16; | |
203 static const int kGrowthFactory = 4; | |
204 static const int kMinConversionSlack = 256; | |
205 static const int kMaxGrowth = 1 * MB; | |
206 inline int NewCapacity(int min_capacity) { | |
207 int capacity = Max(min_capacity, backing_store_.length()); | |
208 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); | |
209 return new_capacity; | |
210 } | |
211 | |
212 void ExpandBuffer() { | |
213 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); | |
214 memcpy(new_store.start(), backing_store_.start(), position_); | |
215 backing_store_.Dispose(); | |
216 backing_store_ = new_store; | |
217 } | |
218 | |
219 void ConvertToUC16() { | |
220 ASSERT(is_ascii_); | |
221 Vector<byte> new_store; | |
222 int new_content_size = position_ * kUC16Size; | |
223 if (new_content_size >= backing_store_.length()) { | |
224 // Ensure room for all currently read characters as UC16 as well | |
225 // as the character about to be stored. | |
226 new_store = Vector<byte>::New(NewCapacity(new_content_size)); | |
227 } else { | |
228 new_store = backing_store_; | |
229 } | |
230 char* src = reinterpret_cast<char*>(backing_store_.start()); | |
231 uc16* dst = reinterpret_cast<uc16*>(new_store.start()); | |
232 for (int i = position_ - 1; i >= 0; i--) { | |
233 dst[i] = src[i]; | |
234 } | |
235 if (new_store.start() != backing_store_.start()) { | |
236 backing_store_.Dispose(); | |
237 backing_store_ = new_store; | |
238 } | |
239 position_ = new_content_size; | |
240 is_ascii_ = false; | |
241 } | |
242 | |
243 bool is_ascii_; | |
244 int position_; | |
245 Vector<byte> backing_store_; | |
246 | |
247 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); | |
248 }; | |
249 | |
250 | |
251 // ---------------------------------------------------------------------------- | |
252 // Scanner base-class. | |
253 | |
254 // Generic functionality used by both JSON and JavaScript scanners. | |
255 class Scanner { | |
256 public: | |
257 // -1 is outside of the range of any real source code. | |
258 static const int kNoOctalLocation = -1; | |
259 | |
260 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | |
261 | |
262 class LiteralScope { | |
263 public: | |
264 explicit LiteralScope(Scanner* self); | |
265 ~LiteralScope(); | |
266 void Complete(); | |
267 | |
268 private: | |
269 Scanner* scanner_; | |
270 bool complete_; | |
271 }; | |
272 | |
273 explicit Scanner(UnicodeCache* scanner_contants); | |
274 | |
275 // Returns the current token again. | |
276 Token::Value current_token() { return current_.token; } | |
277 | |
278 // One token look-ahead (past the token returned by Next()). | |
279 Token::Value peek() const { return next_.token; } | |
280 | |
281 struct Location { | |
282 Location(int b, int e) : beg_pos(b), end_pos(e) { } | |
283 Location() : beg_pos(0), end_pos(0) { } | |
284 | |
285 bool IsValid() const { | |
286 return beg_pos >= 0 && end_pos >= beg_pos; | |
287 } | |
288 | |
289 static Location invalid() { return Location(-1, -1); } | |
290 | |
291 int beg_pos; | |
292 int end_pos; | |
293 }; | |
294 | |
295 // Returns the location information for the current token | |
296 // (the token returned by Next()). | |
297 Location location() const { return current_.location; } | |
298 Location peek_location() const { return next_.location; } | |
299 | |
300 // Returns the literal string, if any, for the current token (the | |
301 // token returned by Next()). The string is 0-terminated and in | |
302 // UTF-8 format; they may contain 0-characters. Literal strings are | |
303 // collected for identifiers, strings, and numbers. | |
304 // These functions only give the correct result if the literal | |
305 // was scanned between calls to StartLiteral() and TerminateLiteral(). | |
306 bool is_literal_ascii() { | |
307 ASSERT_NOT_NULL(current_.literal_chars); | |
308 return current_.literal_chars->is_ascii(); | |
309 } | |
310 Vector<const char> literal_ascii_string() { | |
311 ASSERT_NOT_NULL(current_.literal_chars); | |
312 return current_.literal_chars->ascii_literal(); | |
313 } | |
314 Vector<const uc16> literal_uc16_string() { | |
315 ASSERT_NOT_NULL(current_.literal_chars); | |
316 return current_.literal_chars->uc16_literal(); | |
317 } | |
318 int literal_length() const { | |
319 ASSERT_NOT_NULL(current_.literal_chars); | |
320 return current_.literal_chars->length(); | |
321 } | |
322 | |
323 bool literal_contains_escapes() const { | |
324 Location location = current_.location; | |
325 int source_length = (location.end_pos - location.beg_pos); | |
326 if (current_.token == Token::STRING) { | |
327 // Subtract delimiters. | |
328 source_length -= 2; | |
329 } | |
330 return current_.literal_chars->length() != source_length; | |
331 } | |
332 | |
333 // Returns the literal string for the next token (the token that | |
334 // would be returned if Next() were called). | |
335 bool is_next_literal_ascii() { | |
336 ASSERT_NOT_NULL(next_.literal_chars); | |
337 return next_.literal_chars->is_ascii(); | |
338 } | |
339 Vector<const char> next_literal_ascii_string() { | |
340 ASSERT_NOT_NULL(next_.literal_chars); | |
341 return next_.literal_chars->ascii_literal(); | |
342 } | |
343 Vector<const uc16> next_literal_uc16_string() { | |
344 ASSERT_NOT_NULL(next_.literal_chars); | |
345 return next_.literal_chars->uc16_literal(); | |
346 } | |
347 int next_literal_length() const { | |
348 ASSERT_NOT_NULL(next_.literal_chars); | |
349 return next_.literal_chars->length(); | |
350 } | |
351 | |
352 UnicodeCache* unicode_cache() { return unicode_cache_; } | |
353 | |
354 static const int kCharacterLookaheadBufferSize = 1; | |
355 | |
356 protected: | |
357 // The current and look-ahead token. | |
358 struct TokenDesc { | |
359 Token::Value token; | |
360 Location location; | |
361 LiteralBuffer* literal_chars; | |
362 }; | |
363 | |
364 // Call this after setting source_ to the input. | |
365 void Init() { | |
366 // Set c0_ (one character ahead) | |
367 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1); | |
368 Advance(); | |
369 // Initialize current_ to not refer to a literal. | |
370 current_.literal_chars = NULL; | |
371 } | |
372 | |
373 // Literal buffer support | |
374 inline void StartLiteral() { | |
375 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ? | |
376 &literal_buffer2_ : &literal_buffer1_; | |
377 free_buffer->Reset(); | |
378 next_.literal_chars = free_buffer; | |
379 } | |
380 | |
381 inline void AddLiteralChar(uc32 c) { | |
382 ASSERT_NOT_NULL(next_.literal_chars); | |
383 next_.literal_chars->AddChar(c); | |
384 } | |
385 | |
386 // Complete scanning of a literal. | |
387 inline void TerminateLiteral() { | |
388 // Does nothing in the current implementation. | |
389 } | |
390 | |
391 // Stops scanning of a literal and drop the collected characters, | |
392 // e.g., due to an encountered error. | |
393 inline void DropLiteral() { | |
394 next_.literal_chars = NULL; | |
395 } | |
396 | |
397 inline void AddLiteralCharAdvance() { | |
398 AddLiteralChar(c0_); | |
399 Advance(); | |
400 } | |
401 | |
402 // Low-level scanning support. | |
403 void Advance() { c0_ = source_->Advance(); } | |
404 void PushBack(uc32 ch) { | |
405 source_->PushBack(c0_); | |
406 c0_ = ch; | |
407 } | |
408 | |
409 inline Token::Value Select(Token::Value tok) { | |
410 Advance(); | |
411 return tok; | |
412 } | |
413 | |
414 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { | |
415 Advance(); | |
416 if (c0_ == next) { | |
417 Advance(); | |
418 return then; | |
419 } else { | |
420 return else_; | |
421 } | |
422 } | |
423 | |
424 uc32 ScanHexNumber(int expected_length); | |
425 | |
426 // Return the current source position. | |
427 int source_pos() { | |
428 return source_->pos() - kCharacterLookaheadBufferSize; | |
429 } | |
430 | |
431 UnicodeCache* unicode_cache_; | |
432 | |
433 // Buffers collecting literal strings, numbers, etc. | |
434 LiteralBuffer literal_buffer1_; | |
435 LiteralBuffer literal_buffer2_; | |
436 | |
437 TokenDesc current_; // desc for current token (as returned by Next()) | |
438 TokenDesc next_; // desc for next token (one token look-ahead) | |
439 | |
440 // Input stream. Must be initialized to an UC16CharacterStream. | |
441 UC16CharacterStream* source_; | |
442 | |
443 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | |
444 uc32 c0_; | |
445 }; | |
446 | |
447 // ---------------------------------------------------------------------------- | |
448 // JavaScriptScanner - base logic for JavaScript scanning. | |
449 | |
450 class JavaScriptScanner : public Scanner { | |
451 public: | |
452 // A LiteralScope that disables recording of some types of JavaScript | |
453 // literals. If the scanner is configured to not record the specific | |
454 // type of literal, the scope will not call StartLiteral. | |
455 class LiteralScope { | |
456 public: | |
457 explicit LiteralScope(JavaScriptScanner* self) | |
458 : scanner_(self), complete_(false) { | |
459 scanner_->StartLiteral(); | |
460 } | |
461 ~LiteralScope() { | |
462 if (!complete_) scanner_->DropLiteral(); | |
463 } | |
464 void Complete() { | |
465 scanner_->TerminateLiteral(); | |
466 complete_ = true; | |
467 } | |
468 | |
469 private: | |
470 JavaScriptScanner* scanner_; | |
471 bool complete_; | |
472 }; | |
473 | |
474 explicit JavaScriptScanner(UnicodeCache* scanner_contants); | |
475 | |
476 void Initialize(UC16CharacterStream* source); | |
477 | |
478 // Returns the next token. | |
479 Token::Value Next(); | |
480 | |
481 // Returns true if there was a line terminator before the peek'ed token, | |
482 // possibly inside a multi-line comment. | |
483 bool HasAnyLineTerminatorBeforeNext() const { | |
484 return has_line_terminator_before_next_ || | |
485 has_multiline_comment_before_next_; | |
486 } | |
487 | |
488 // Scans the input as a regular expression pattern, previous | |
489 // character(s) must be /(=). Returns true if a pattern is scanned. | |
490 bool ScanRegExpPattern(bool seen_equal); | |
491 // Returns true if regexp flags are scanned (always since flags can | |
492 // be empty). | |
493 bool ScanRegExpFlags(); | |
494 | |
495 // Tells whether the buffer contains an identifier (no escapes). | |
496 // Used for checking if a property name is an identifier. | |
497 static bool IsIdentifier(unibrow::CharacterStream* buffer); | |
498 | |
499 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. | |
500 uc32 ScanOctalEscape(uc32 c, int length); | |
501 | |
502 // Returns the location of the last seen octal literal | |
503 Location octal_position() const { return octal_pos_; } | |
504 void clear_octal_position() { octal_pos_ = Location::invalid(); } | |
505 | |
506 // Seek forward to the given position. This operation does not | |
507 // work in general, for instance when there are pushed back | |
508 // characters, but works for seeking forward until simple delimiter | |
509 // tokens, which is what it is used for. | |
510 void SeekForward(int pos); | |
511 | |
512 bool HarmonyBlockScoping() const { | |
513 return harmony_block_scoping_; | |
514 } | |
515 void SetHarmonyBlockScoping(bool block_scoping) { | |
516 harmony_block_scoping_ = block_scoping; | |
517 } | |
518 | |
519 | |
520 protected: | |
521 bool SkipWhiteSpace(); | |
522 Token::Value SkipSingleLineComment(); | |
523 Token::Value SkipMultiLineComment(); | |
524 | |
525 // Scans a single JavaScript token. | |
526 void Scan(); | |
527 | |
528 void ScanDecimalDigits(); | |
529 Token::Value ScanNumber(bool seen_period); | |
530 Token::Value ScanIdentifierOrKeyword(); | |
531 Token::Value ScanIdentifierSuffix(LiteralScope* literal); | |
532 | |
533 void ScanEscape(); | |
534 Token::Value ScanString(); | |
535 | |
536 // Scans a possible HTML comment -- begins with '<!'. | |
537 Token::Value ScanHtmlComment(); | |
538 | |
539 // Decodes a unicode escape-sequence which is part of an identifier. | |
540 // If the escape sequence cannot be decoded the result is kBadChar. | |
541 uc32 ScanIdentifierUnicodeEscape(); | |
542 // Recognizes a uniocde escape-sequence and adds its characters, | |
543 // uninterpreted, to the current literal. Used for parsing RegExp | |
544 // flags. | |
545 bool ScanLiteralUnicodeEscape(); | |
546 | |
547 // Start position of the octal literal last scanned. | |
548 Location octal_pos_; | |
549 | |
550 // Whether there is a line terminator whitespace character after | |
551 // the current token, and before the next. Does not count newlines | |
552 // inside multiline comments. | |
553 bool has_line_terminator_before_next_; | |
554 // Whether there is a multi-line comment that contains a | |
555 // line-terminator after the current token, and before the next. | |
556 bool has_multiline_comment_before_next_; | |
557 // Whether we scan 'let' as a keyword for harmony block scoped | |
558 // let bindings. | |
559 bool harmony_block_scoping_; | |
560 }; | |
561 | |
562 } } // namespace v8::internal | |
563 | |
564 #endif // V8_SCANNER_BASE_H_ | |
OLD | NEW |