OLD | NEW |
1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 17 matching lines...) Expand all Loading... |
28 #ifndef V8_SCANNER_H_ | 28 #ifndef V8_SCANNER_H_ |
29 #define V8_SCANNER_H_ | 29 #define V8_SCANNER_H_ |
30 | 30 |
31 #include "token.h" | 31 #include "token.h" |
32 #include "char-predicates-inl.h" | 32 #include "char-predicates-inl.h" |
33 #include "scanner-base.h" | 33 #include "scanner-base.h" |
34 | 34 |
35 namespace v8 { | 35 namespace v8 { |
36 namespace internal { | 36 namespace internal { |
37 | 37 |
38 | |
39 class UTF8Buffer { | |
40 public: | |
41 UTF8Buffer(); | |
42 ~UTF8Buffer(); | |
43 | |
44 inline void AddChar(uc32 c) { | |
45 if (recording_) { | |
46 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { | |
47 buffer_.Add(static_cast<char>(c)); | |
48 } else { | |
49 AddCharSlow(c); | |
50 } | |
51 } | |
52 } | |
53 | |
54 void StartLiteral() { | |
55 buffer_.StartSequence(); | |
56 recording_ = true; | |
57 } | |
58 | |
59 Vector<const char> EndLiteral() { | |
60 if (recording_) { | |
61 recording_ = false; | |
62 buffer_.Add(kEndMarker); | |
63 Vector<char> sequence = buffer_.EndSequence(); | |
64 return Vector<const char>(sequence.start(), sequence.length()); | |
65 } | |
66 return Vector<const char>(); | |
67 } | |
68 | |
69 void DropLiteral() { | |
70 if (recording_) { | |
71 recording_ = false; | |
72 buffer_.DropSequence(); | |
73 } | |
74 } | |
75 | |
76 void Reset() { | |
77 buffer_.Reset(); | |
78 } | |
79 | |
80 // The end marker added after a parsed literal. | |
81 // Using zero allows the usage of strlen and similar functions on | |
82 // identifiers and numbers (but not strings, since they may contain zero | |
83 // bytes). | |
84 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside | |
85 // an utf-8 string. This requires changes in all places that uses | |
86 // str-functions on the literals, but allows a single pointer to represent | |
87 // the literal, even if it contains embedded zeros. | |
88 static const char kEndMarker = '\x00'; | |
89 private: | |
90 static const int kInitialCapacity = 256; | |
91 SequenceCollector<char, 4> buffer_; | |
92 bool recording_; | |
93 void AddCharSlow(uc32 c); | |
94 }; | |
95 | |
96 | |
97 // UTF16 buffer to read characters from a character stream. | 38 // UTF16 buffer to read characters from a character stream. |
98 class CharacterStreamUTF16Buffer: public UTF16Buffer { | 39 class CharacterStreamUTF16Buffer: public UTF16Buffer { |
99 public: | 40 public: |
100 CharacterStreamUTF16Buffer(); | 41 CharacterStreamUTF16Buffer(); |
101 virtual ~CharacterStreamUTF16Buffer() {} | 42 virtual ~CharacterStreamUTF16Buffer() {} |
102 void Initialize(Handle<String> data, | 43 void Initialize(Handle<String> data, |
103 unibrow::CharacterStream* stream, | 44 unibrow::CharacterStream* stream, |
104 int start_position, | 45 int start_position, |
105 int end_position); | 46 int end_position); |
106 virtual void PushBack(uc32 ch); | 47 virtual void PushBack(uc32 ch); |
(...skipping 20 matching lines...) Expand all Loading... |
127 int end_position); | 68 int end_position); |
128 virtual void PushBack(uc32 ch); | 69 virtual void PushBack(uc32 ch); |
129 virtual uc32 Advance(); | 70 virtual uc32 Advance(); |
130 virtual void SeekForward(int pos); | 71 virtual void SeekForward(int pos); |
131 | 72 |
132 private: | 73 private: |
133 const CharType* raw_data_; // Pointer to the actual array of characters. | 74 const CharType* raw_data_; // Pointer to the actual array of characters. |
134 }; | 75 }; |
135 | 76 |
136 | 77 |
137 enum ParserLanguage { JAVASCRIPT, JSON }; | 78 // Initializes a UTF16Buffer as input stream, using one of a number |
| 79 // of strategies depending on the available character sources. |
| 80 class StreamInitializer { |
| 81 public: |
| 82 UTF16Buffer* Init(Handle<String> source, |
| 83 unibrow::CharacterStream* stream, |
| 84 int start_position, |
| 85 int end_position); |
| 86 private: |
| 87 // Different UTF16 buffers used to pull characters from. Based on input one of |
| 88 // these will be initialized as the actual data source. |
| 89 CharacterStreamUTF16Buffer char_stream_buffer_; |
| 90 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> |
| 91 two_byte_string_buffer_; |
| 92 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; |
| 93 |
| 94 // Used to convert the source string into a character stream when a stream |
| 95 // is not passed to the scanner. |
| 96 SafeStringInputBuffer safe_string_input_buffer_; |
| 97 }; |
| 98 |
| 99 // ---------------------------------------------------------------------------- |
| 100 // V8JavaScriptScanner |
| 101 // JavaScript scanner getting its input from either a V8 String or a unicode |
| 102 // CharacterStream. |
| 103 |
| 104 class V8JavaScriptScanner : public JavaScriptScanner { |
| 105 public: |
| 106 V8JavaScriptScanner() {} |
| 107 |
| 108 Token::Value NextCheckStack(); |
| 109 |
| 110 // Initialize the Scanner to scan source. |
| 111 void Initialize(Handle<String> source); |
| 112 void Initialize(Handle<String> source, |
| 113 unibrow::CharacterStream* stream); |
| 114 void Initialize(Handle<String> source, |
| 115 int start_position, int end_position); |
| 116 |
| 117 protected: |
| 118 StreamInitializer stream_initializer_; |
| 119 }; |
138 | 120 |
139 | 121 |
140 class Scanner { | 122 class JsonScanner : public Scanner { |
141 public: | 123 public: |
142 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 124 JsonScanner(); |
143 | |
144 class LiteralScope { | |
145 public: | |
146 explicit LiteralScope(Scanner* self); | |
147 ~LiteralScope(); | |
148 void Complete(); | |
149 | |
150 private: | |
151 Scanner* scanner_; | |
152 bool complete_; | |
153 }; | |
154 | |
155 Scanner(); | |
156 | 125 |
157 // Initialize the Scanner to scan source. | 126 // Initialize the Scanner to scan source. |
158 void Initialize(Handle<String> source, | 127 void Initialize(Handle<String> source); |
159 ParserLanguage language); | |
160 void Initialize(Handle<String> source, | |
161 unibrow::CharacterStream* stream, | |
162 ParserLanguage language); | |
163 void Initialize(Handle<String> source, | |
164 int start_position, int end_position, | |
165 ParserLanguage language); | |
166 | 128 |
167 // Returns the next token. | 129 // Returns the next token. |
168 Token::Value Next(); | 130 Token::Value Next(); |
169 | 131 |
170 // Returns the current token again. | 132 protected: |
171 Token::Value current_token() { return current_.token; } | 133 // Skip past JSON whitespace (only space, tab, newline and carriage-return). |
172 | |
173 // One token look-ahead (past the token returned by Next()). | |
174 Token::Value peek() const { return next_.token; } | |
175 | |
176 // Returns true if there was a line terminator before the peek'ed token. | |
177 bool has_line_terminator_before_next() const { | |
178 return has_line_terminator_before_next_; | |
179 } | |
180 | |
181 struct Location { | |
182 Location(int b, int e) : beg_pos(b), end_pos(e) { } | |
183 Location() : beg_pos(0), end_pos(0) { } | |
184 int beg_pos; | |
185 int end_pos; | |
186 }; | |
187 | |
188 // Returns the location information for the current token | |
189 // (the token returned by Next()). | |
190 Location location() const { return current_.location; } | |
191 Location peek_location() const { return next_.location; } | |
192 | |
193 // Returns the literal string, if any, for the current token (the | |
194 // token returned by Next()). The string is 0-terminated and in | |
195 // UTF-8 format; they may contain 0-characters. Literal strings are | |
196 // collected for identifiers, strings, and numbers. | |
197 // These functions only give the correct result if the literal | |
198 // was scanned between calls to StartLiteral() and TerminateLiteral(). | |
199 const char* literal_string() const { | |
200 return current_.literal_chars.start(); | |
201 } | |
202 | |
203 int literal_length() const { | |
204 // Excluding terminal '\x00' added by TerminateLiteral(). | |
205 return current_.literal_chars.length() - 1; | |
206 } | |
207 | |
208 Vector<const char> literal() const { | |
209 return Vector<const char>(literal_string(), literal_length()); | |
210 } | |
211 | |
212 // Returns the literal string for the next token (the token that | |
213 // would be returned if Next() were called). | |
214 const char* next_literal_string() const { | |
215 return next_.literal_chars.start(); | |
216 } | |
217 | |
218 | |
219 // Returns the length of the next token (that would be returned if | |
220 // Next() were called). | |
221 int next_literal_length() const { | |
222 // Excluding terminal '\x00' added by TerminateLiteral(). | |
223 return next_.literal_chars.length() - 1; | |
224 } | |
225 | |
226 Vector<const char> next_literal() const { | |
227 return Vector<const char>(next_literal_string(), next_literal_length()); | |
228 } | |
229 | |
230 // Scans the input as a regular expression pattern, previous | |
231 // character(s) must be /(=). Returns true if a pattern is scanned. | |
232 bool ScanRegExpPattern(bool seen_equal); | |
233 // Returns true if regexp flags are scanned (always since flags can | |
234 // be empty). | |
235 bool ScanRegExpFlags(); | |
236 | |
237 // Seek forward to the given position. This operation does not | |
238 // work in general, for instance when there are pushed back | |
239 // characters, but works for seeking forward until simple delimiter | |
240 // tokens, which is what it is used for. | |
241 void SeekForward(int pos); | |
242 | |
243 bool stack_overflow() { return stack_overflow_; } | |
244 | |
245 // Tells whether the buffer contains an identifier (no escapes). | |
246 // Used for checking if a property name is an identifier. | |
247 static bool IsIdentifier(unibrow::CharacterStream* buffer); | |
248 | |
249 static const int kCharacterLookaheadBufferSize = 1; | |
250 static const int kNoEndPosition = 1; | |
251 | |
252 private: | |
253 // The current and look-ahead token. | |
254 struct TokenDesc { | |
255 Token::Value token; | |
256 Location location; | |
257 Vector<const char> literal_chars; | |
258 }; | |
259 | |
260 void Init(Handle<String> source, | |
261 unibrow::CharacterStream* stream, | |
262 int start_position, int end_position, | |
263 ParserLanguage language); | |
264 | |
265 // Literal buffer support | |
266 inline void StartLiteral(); | |
267 inline void AddLiteralChar(uc32 ch); | |
268 inline void AddLiteralCharAdvance(); | |
269 inline void TerminateLiteral(); | |
270 // Stops scanning of a literal, e.g., due to an encountered error. | |
271 inline void DropLiteral(); | |
272 | |
273 // Low-level scanning support. | |
274 void Advance() { c0_ = source_->Advance(); } | |
275 void PushBack(uc32 ch) { | |
276 source_->PushBack(ch); | |
277 c0_ = ch; | |
278 } | |
279 | |
280 bool SkipWhiteSpace() { | |
281 if (is_parsing_json_) { | |
282 return SkipJsonWhiteSpace(); | |
283 } else { | |
284 return SkipJavaScriptWhiteSpace(); | |
285 } | |
286 } | |
287 | |
288 bool SkipJavaScriptWhiteSpace(); | |
289 bool SkipJsonWhiteSpace(); | 134 bool SkipJsonWhiteSpace(); |
290 Token::Value SkipSingleLineComment(); | |
291 Token::Value SkipMultiLineComment(); | |
292 | |
293 inline Token::Value Select(Token::Value tok); | |
294 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); | |
295 | |
296 inline void Scan() { | |
297 if (is_parsing_json_) { | |
298 ScanJson(); | |
299 } else { | |
300 ScanJavaScript(); | |
301 } | |
302 } | |
303 | |
304 // Scans a single JavaScript token. | |
305 void ScanJavaScript(); | |
306 | 135 |
307 // Scan a single JSON token. The JSON lexical grammar is specified in the | 136 // Scan a single JSON token. The JSON lexical grammar is specified in the |
308 // ECMAScript 5 standard, section 15.12.1.1. | 137 // ECMAScript 5 standard, section 15.12.1.1. |
309 // Recognizes all of the single-character tokens directly, or calls a function | 138 // Recognizes all of the single-character tokens directly, or calls a function |
310 // to scan a number, string or identifier literal. | 139 // to scan a number, string or identifier literal. |
311 // The only allowed whitespace characters between tokens are tab, | 140 // The only allowed whitespace characters between tokens are tab, |
312 // carriage-return, newline and space. | 141 // carriage-return, newline and space. |
313 void ScanJson(); | 142 void ScanJson(); |
314 | 143 |
315 // A JSON number (production JSONNumber) is a subset of the valid JavaScript | 144 // A JSON number (production JSONNumber) is a subset of the valid JavaScript |
316 // decimal number literals. | 145 // decimal number literals. |
317 // It includes an optional minus sign, must have at least one | 146 // It includes an optional minus sign, must have at least one |
318 // digit before and after a decimal point, may not have prefixed zeros (unless | 147 // digit before and after a decimal point, may not have prefixed zeros (unless |
319 // the integer part is zero), and may include an exponent part (e.g., "e-10"). | 148 // the integer part is zero), and may include an exponent part (e.g., "e-10"). |
320 // Hexadecimal and octal numbers are not allowed. | 149 // Hexadecimal and octal numbers are not allowed. |
321 Token::Value ScanJsonNumber(); | 150 Token::Value ScanJsonNumber(); |
322 | 151 |
323 // A JSON string (production JSONString) is subset of valid JavaScript string | 152 // A JSON string (production JSONString) is subset of valid JavaScript string |
324 // literals. The string must only be double-quoted (not single-quoted), and | 153 // literals. The string must only be double-quoted (not single-quoted), and |
325 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and | 154 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and |
326 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. | 155 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. |
327 Token::Value ScanJsonString(); | 156 Token::Value ScanJsonString(); |
328 | 157 |
329 // Used to recognize one of the literals "true", "false", or "null". These | 159 // Used to recognize one of the literals "true", "false", or "null". These |
330 // are the only valid JSON identifiers (productions JSONBooleanLiteral, | 159 // are the only valid JSON identifiers (productions JSONBooleanLiteral, |
331 // JSONNullLiteral). | 160 // JSONNullLiteral). |
332 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); | 161 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); |
333 | 162 |
334 void ScanDecimalDigits(); | 163 StreamInitializer stream_initializer_; |
335 Token::Value ScanNumber(bool seen_period); | |
336 Token::Value ScanIdentifier(); | |
337 uc32 ScanHexEscape(uc32 c, int length); | |
338 uc32 ScanOctalEscape(uc32 c, int length); | |
339 void ScanEscape(); | |
340 Token::Value ScanString(); | |
341 | |
342 // Scans a possible HTML comment -- begins with '<!'. | |
343 Token::Value ScanHtmlComment(); | |
344 | |
345 // Return the current source position. | |
346 int source_pos() { | |
347 return source_->pos() - kCharacterLookaheadBufferSize; | |
348 } | |
349 | |
350 // Decodes a unicode escape-sequence which is part of an identifier. | |
351 // If the escape sequence cannot be decoded the result is kBadRune. | |
352 uc32 ScanIdentifierUnicodeEscape(); | |
353 | |
354 TokenDesc current_; // desc for current token (as returned by Next()) | |
355 TokenDesc next_; // desc for next token (one token look-ahead) | |
356 bool has_line_terminator_before_next_; | |
357 bool is_parsing_json_; | |
358 | |
359 // Different UTF16 buffers used to pull characters from. Based on input one of | |
360 // these will be initialized as the actual data source. | |
361 CharacterStreamUTF16Buffer char_stream_buffer_; | |
362 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> | |
363 two_byte_string_buffer_; | |
364 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; | |
365 | |
366 // Source. Will point to one of the buffers declared above. | |
367 UTF16Buffer* source_; | |
368 | |
369 // Used to convert the source string into a character stream when a stream | |
370 // is not passed to the scanner. | |
371 SafeStringInputBuffer safe_string_input_buffer_; | |
372 | |
373 // Buffer to hold literal values (identifiers, strings, numbers) | |
374 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. | |
375 UTF8Buffer literal_buffer_; | |
376 | |
377 bool stack_overflow_; | |
378 | |
379 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | |
380 uc32 c0_; | |
381 }; | 164 }; |
382 | 165 |
383 | 166 |
384 // ExternalStringUTF16Buffer | 167 // ExternalStringUTF16Buffer |
385 template <typename StringType, typename CharType> | 168 template <typename StringType, typename CharType> |
386 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() | 169 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() |
387 : raw_data_(NULL) { } | 170 : raw_data_(NULL) { } |
388 | 171 |
389 | 172 |
390 template <typename StringType, typename CharType> | 173 template <typename StringType, typename CharType> |
391 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( | 174 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( |
392 Handle<StringType> data, | 175 Handle<StringType> data, |
393 int start_position, | 176 int start_position, |
394 int end_position) { | 177 int end_position) { |
395 ASSERT(!data.is_null()); | 178 ASSERT(!data.is_null()); |
396 raw_data_ = data->resource()->data(); | 179 raw_data_ = data->resource()->data(); |
397 | 180 |
398 ASSERT(end_position <= data->length()); | 181 ASSERT(end_position <= data->length()); |
399 if (start_position > 0) { | 182 if (start_position > 0) { |
400 SeekForward(start_position); | 183 SeekForward(start_position); |
401 } | 184 } |
402 end_ = | 185 end_ = |
403 end_position != Scanner::kNoEndPosition ? end_position : data->length(); | 186 end_position != kNoEndPosition ? end_position : data->length(); |
404 } | 187 } |
405 | 188 |
406 | 189 |
407 template <typename StringType, typename CharType> | 190 template <typename StringType, typename CharType> |
408 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { | 191 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { |
409 if (pos_ < end_) { | 192 if (pos_ < end_) { |
410 return raw_data_[pos_++]; | 193 return raw_data_[pos_++]; |
411 } else { | 194 } else { |
412 // note: currently the following increment is necessary to avoid a | 195 // note: currently the following increment is necessary to avoid a |
413 // test-parser problem! | 196 // test-parser problem! |
(...skipping 12 matching lines...) Expand all Loading... |
426 | 209 |
427 | 210 |
428 template <typename StringType, typename CharType> | 211 template <typename StringType, typename CharType> |
429 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { | 212 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { |
430 pos_ = pos; | 213 pos_ = pos; |
431 } | 214 } |
432 | 215 |
433 } } // namespace v8::internal | 216 } } // namespace v8::internal |
434 | 217 |
435 #endif // V8_SCANNER_H_ | 218 #endif // V8_SCANNER_H_ |
OLD | NEW |