OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Features shared by parsing and pre-parsing scanners. | 5 // Features shared by parsing and pre-parsing scanners. |
6 | 6 |
7 #ifndef V8_PARSING_SCANNER_H_ | 7 #ifndef V8_PARSING_SCANNER_H_ |
8 #define V8_PARSING_SCANNER_H_ | 8 #define V8_PARSING_SCANNER_H_ |
9 | 9 |
10 #include "src/allocation.h" | 10 #include "src/allocation.h" |
11 #include "src/base/logging.h" | 11 #include "src/base/logging.h" |
12 #include "src/char-predicates.h" | 12 #include "src/char-predicates.h" |
13 #include "src/globals.h" | 13 #include "src/globals.h" |
14 #include "src/messages.h" | 14 #include "src/messages.h" |
15 #include "src/parsing/token.h" | 15 #include "src/parsing/token.h" |
16 #include "src/unicode-decoder.h" | 16 #include "src/unicode-decoder.h" |
17 #include "src/unicode.h" | 17 #include "src/unicode.h" |
18 | 18 |
19 namespace v8 { | 19 namespace v8 { |
20 namespace internal { | 20 namespace internal { |
21 | 21 |
22 | 22 |
23 class AstRawString; | 23 class AstRawString; |
24 class AstValueFactory; | 24 class AstValueFactory; |
25 class DuplicateFinder; | 25 class DuplicateFinder; |
| 26 class ExternalOneByteString; |
| 27 class ExternalTwoByteString; |
26 class ParserRecorder; | 28 class ParserRecorder; |
27 class UnicodeCache; | 29 class UnicodeCache; |
28 | 30 |
29 | |
30 // --------------------------------------------------------------------- | 31 // --------------------------------------------------------------------- |
31 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. | 32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. |
32 // A code unit is a 16 bit value representing either a 16 bit code point | 33 // A code unit is a 16 bit value representing either a 16 bit code point |
33 // or one part of a surrogate pair that make a single 21 bit code point. | 34 // or one part of a surrogate pair that make a single 21 bit code point. |
34 | |
35 class Utf16CharacterStream { | 35 class Utf16CharacterStream { |
36 public: | 36 public: |
37 Utf16CharacterStream() : pos_(0) { } | 37 static const uc32 kEndOfInput = -1; |
| 38 |
38 virtual ~Utf16CharacterStream() { } | 39 virtual ~Utf16CharacterStream() { } |
39 | 40 |
40 // Returns and advances past the next UTF-16 code unit in the input | 41 // Returns and advances past the next UTF-16 code unit in the input |
41 // stream. If there are no more code units, it returns a negative | 42 // stream. If there are no more code units it returns kEndOfInput. |
42 // value. | |
43 inline uc32 Advance() { | 43 inline uc32 Advance() { |
44 if (buffer_cursor_ < buffer_end_ || ReadBlock()) { | 44 if (V8_LIKELY(buffer_cursor_ < buffer_end_)) { |
45 pos_++; | |
46 return static_cast<uc32>(*(buffer_cursor_++)); | 45 return static_cast<uc32>(*(buffer_cursor_++)); |
| 46 } else if (ReadBlock()) { |
| 47 return static_cast<uc32>(*(buffer_cursor_++)); |
| 48 } else { |
| 49 // Note: currently the following increment is necessary to avoid a |
| 50 // parser problem! The scanner treats the final kEndOfInput as |
| 51 // a code unit with a position, and does math relative to that |
| 52 // position. |
| 53 buffer_cursor_++; |
| 54 return kEndOfInput; |
47 } | 55 } |
48 // Note: currently the following increment is necessary to avoid a | |
49 // parser problem! The scanner treats the final kEndOfInput as | |
50 // a code unit with a position, and does math relative to that | |
51 // position. | |
52 pos_++; | |
53 | |
54 return kEndOfInput; | |
55 } | 56 } |
56 | 57 |
57 // Return the current position in the code unit stream. | 58 // Go back by one character in the input stream. |
58 // Starts at zero. | 59 // This undoes the most recent Advance(). |
59 inline size_t pos() const { return pos_; } | 60 inline void Back() { |
60 | 61 // The common case - the previous character lies within |
61 // Skips forward past the next code_unit_count UTF-16 code units | 62 // buffer_start_ .. buffer_end_ - is handled locally. |
62 // in the input, or until the end of input if that comes sooner. | 63 // Otherwise, a new block is requested. |
63 // Returns the number of code units actually skipped. If less | 64 if (V8_LIKELY(buffer_cursor_ > buffer_start_)) { |
64 // than code_unit_count, | 65 buffer_cursor_--; |
65 inline size_t SeekForward(size_t code_unit_count) { | 66 } else { |
66 size_t buffered_chars = buffer_end_ - buffer_cursor_; | 67 ReadBlockAt(pos() - 1); |
67 if (code_unit_count <= buffered_chars) { | |
68 buffer_cursor_ += code_unit_count; | |
69 pos_ += code_unit_count; | |
70 return code_unit_count; | |
71 } | 68 } |
72 return SlowSeekForward(code_unit_count); | |
73 } | 69 } |
74 | 70 |
75 // Pushes back the most recently read UTF-16 code unit (or negative | 71 // Go back by two characters in the input stream. (This is the same as |
76 // value if at end of input), i.e., the value returned by the most recent | 72 // calling Back() twice. But Back() may - in some instances - do substantial |
77 // call to Advance. | 73 // work. Back2() guarantees this work will be done only once.) |
78 // Must not be used right after calling SeekForward. | 74 inline void Back2() { |
79 virtual void PushBack(int32_t code_unit) = 0; | 75 if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) { |
| 76 buffer_cursor_ -= 2; |
| 77 } else { |
| 78 ReadBlockAt(pos() - 2); |
| 79 } |
| 80 } |
80 | 81 |
81 virtual bool SetBookmark(); | 82 inline size_t pos() const { |
82 virtual void ResetToBookmark(); | 83 return buffer_pos_ + (buffer_cursor_ - buffer_start_); |
| 84 } |
| 85 |
| 86 inline void Seek(size_t pos) { |
| 87 if (V8_LIKELY(pos >= buffer_pos_ && |
| 88 pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) { |
| 89 buffer_cursor_ = buffer_start_ + (pos - buffer_pos_); |
| 90 } else { |
| 91 ReadBlockAt(pos); |
| 92 } |
| 93 } |
| 94 |
| 95 // Legacy API: |
| 96 void SeekForward(size_t code_unit_count) { Seek(pos() + code_unit_count); } |
| 97 void PushBack(int32_t code_unit) { |
| 98 Back(); |
| 99 #ifdef DEBUG |
| 100 uc32 t = Advance(); |
| 101 DCHECK_EQ(t, code_unit); |
| 102 Back(); |
| 103 #endif // DEBUG |
| 104 } |
| 105 void PushBack2(int32_t code_unit_back_1, int32_t code_unit_back_2) { |
| 106 Back2(); |
| 107 #ifdef DEBUG |
| 108 DCHECK_EQ(Advance(), code_unit_back_2); |
| 109 DCHECK_EQ(Advance(), code_unit_back_1); |
| 110 Back2(); |
| 111 #endif // DEBUG |
| 112 } |
| 113 bool SetBookmark() { |
| 114 bookmark_ = pos(); |
| 115 return true; |
| 116 } |
| 117 void ResetToBookmark() { |
| 118 DCHECK_NE(bookmark_, kNoBookmark); |
| 119 Seek(bookmark_); |
| 120 } |
83 | 121 |
84 protected: | 122 protected: |
85 static const uc32 kEndOfInput = -1; | 123 static const size_t kNoBookmark; |
86 | 124 |
87 // Ensures that the buffer_cursor_ points to the code_unit at | 125 Utf16CharacterStream(const uint16_t* buffer_start, |
88 // position pos_ of the input, if possible. If the position | 126 const uint16_t* buffer_cursor, |
89 // is at or after the end of the input, return false. If there | 127 const uint16_t* buffer_end, size_t buffer_pos) |
90 // are more code_units available, return true. | 128 : buffer_start_(buffer_start), |
| 129 buffer_cursor_(buffer_cursor), |
| 130 buffer_end_(buffer_end), |
| 131 buffer_pos_(buffer_pos), |
| 132 bookmark_(kNoBookmark) {} |
| 133 Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {} |
| 134 |
| 135 void ReadBlockAt(size_t new_pos) { |
| 136 // The callers of this method (Back/Back2/Seek) should handle the easy |
| 137 // case (seeking within the current buffer), and we should only get here |
| 138 // if we actually require new data. |
| 139 // (This is really an efficiency check, not a correctness invariant.) |
| 140 DCHECK(new_pos < buffer_pos_ || |
| 141 new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_)); |
| 142 |
| 143 // Change pos() to point to new_pos. |
| 144 buffer_pos_ = new_pos; |
| 145 buffer_cursor_ = buffer_start_; |
| 146 bool success = ReadBlock(); |
| 147 USE(success); |
| 148 |
| 149 // Post-conditions: 1, on success, we should be at the right position. |
| 150 // 2, success == we should have more characters available. |
| 151 DCHECK_IMPLIES(success, pos() == new_pos); |
| 152 DCHECK_EQ(success, buffer_cursor_ < buffer_end_); |
| 153 DCHECK_EQ(success, buffer_start_ < buffer_end_); |
| 154 } |
| 155 |
| 156 // Read more data, and update buffer_*_ to point to it. |
| 157 // Returns true if more data was available. |
| 158 // |
| 159 // ReadBlock() may modify any of the buffer_*_ members, but must ensure that |
| 160 // the result of pos() remains unaffected. |
| 161 // |
| 162 // Examples: |
| 163 // - a stream could fill a separate buffer. Then buffer_start_ and |
| 164 // buffer_cursor_ would point to the beginning of the buffer, and |
| 165 // buffer_pos_ would be the old pos(). |
| 166 // - a stream with existing buffer chunks would set buffer_start_ and |
| 167 // buffer_end_ to cover the full chunk, and then buffer_cursor_ would |
| 168 // point into the middle of the buffer, while buffer_pos_ would describe |
| 169 // the start of the buffer. |
91 virtual bool ReadBlock() = 0; | 170 virtual bool ReadBlock() = 0; |
92 virtual size_t SlowSeekForward(size_t code_unit_count) = 0; | |
93 | 171 |
| 172 const uint16_t* buffer_start_; |
94 const uint16_t* buffer_cursor_; | 173 const uint16_t* buffer_cursor_; |
95 const uint16_t* buffer_end_; | 174 const uint16_t* buffer_end_; |
96 size_t pos_; | 175 size_t buffer_pos_; |
| 176 size_t bookmark_; |
97 }; | 177 }; |
98 | 178 |
99 | 179 |
100 // ---------------------------------------------------------------------------- | 180 // ---------------------------------------------------------------------------- |
101 // JavaScript Scanner. | 181 // JavaScript Scanner. |
102 | 182 |
103 class Scanner { | 183 class Scanner { |
104 public: | 184 public: |
105 // Scoped helper for a re-settable bookmark. | 185 // Scoped helper for a re-settable bookmark. |
106 class BookmarkScope { | 186 class BookmarkScope { |
(...skipping 24 matching lines...) Expand all Loading... |
131 } | 211 } |
132 | 212 |
133 static Location invalid() { return Location(-1, -1); } | 213 static Location invalid() { return Location(-1, -1); } |
134 | 214 |
135 int beg_pos; | 215 int beg_pos; |
136 int end_pos; | 216 int end_pos; |
137 }; | 217 }; |
138 | 218 |
139 // -1 is outside of the range of any real source code. | 219 // -1 is outside of the range of any real source code. |
140 static const int kNoOctalLocation = -1; | 220 static const int kNoOctalLocation = -1; |
| 221 static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput; |
141 | 222 |
142 explicit Scanner(UnicodeCache* scanner_contants); | 223 explicit Scanner(UnicodeCache* scanner_contants); |
143 | 224 |
144 void Initialize(Utf16CharacterStream* source); | 225 void Initialize(Utf16CharacterStream* source); |
145 | 226 |
146 // Returns the next token and advances input. | 227 // Returns the next token and advances input. |
147 Token::Value Next(); | 228 Token::Value Next(); |
148 // Returns the token following peek() | 229 // Returns the token following peek() |
149 Token::Value PeekAhead(); | 230 Token::Value PeekAhead(); |
150 // Returns the current token again. | 231 // Returns the current token again. |
(...skipping 641 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
792 bool found_html_comment_; | 873 bool found_html_comment_; |
793 | 874 |
794 MessageTemplate::Template scanner_error_; | 875 MessageTemplate::Template scanner_error_; |
795 Location scanner_error_location_; | 876 Location scanner_error_location_; |
796 }; | 877 }; |
797 | 878 |
798 } // namespace internal | 879 } // namespace internal |
799 } // namespace v8 | 880 } // namespace v8 |
800 | 881 |
801 #endif // V8_PARSING_SCANNER_H_ | 882 #endif // V8_PARSING_SCANNER_H_ |
OLD | NEW |