Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(38)

Side by Side Diff: src/scanner.h

Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base. (Closed)
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/prescanner.h ('k') | src/scanner.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 17 matching lines...) Expand all
28 #ifndef V8_SCANNER_H_ 28 #ifndef V8_SCANNER_H_
29 #define V8_SCANNER_H_ 29 #define V8_SCANNER_H_
30 30
31 #include "token.h" 31 #include "token.h"
32 #include "char-predicates-inl.h" 32 #include "char-predicates-inl.h"
33 #include "scanner-base.h" 33 #include "scanner-base.h"
34 34
35 namespace v8 { 35 namespace v8 {
36 namespace internal { 36 namespace internal {
37 37
38
39 class UTF8Buffer {
40 public:
41 UTF8Buffer();
42 ~UTF8Buffer();
43
44 inline void AddChar(uc32 c) {
45 if (recording_) {
46 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
47 buffer_.Add(static_cast<char>(c));
48 } else {
49 AddCharSlow(c);
50 }
51 }
52 }
53
54 void StartLiteral() {
55 buffer_.StartSequence();
56 recording_ = true;
57 }
58
59 Vector<const char> EndLiteral() {
60 if (recording_) {
61 recording_ = false;
62 buffer_.Add(kEndMarker);
63 Vector<char> sequence = buffer_.EndSequence();
64 return Vector<const char>(sequence.start(), sequence.length());
65 }
66 return Vector<const char>();
67 }
68
69 void DropLiteral() {
70 if (recording_) {
71 recording_ = false;
72 buffer_.DropSequence();
73 }
74 }
75
76 void Reset() {
77 buffer_.Reset();
78 }
79
80 // The end marker added after a parsed literal.
81 // Using zero allows the usage of strlen and similar functions on
82 // identifiers and numbers (but not strings, since they may contain zero
83 // bytes).
84 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
85 // a UTF-8 string. This requires changes in all places that use
86 // str-functions on the literals, but allows a single pointer to represent
87 // the literal, even if it contains embedded zeros.
88 static const char kEndMarker = '\x00';
89 private:
90 static const int kInitialCapacity = 256;
91 SequenceCollector<char, 4> buffer_;
92 bool recording_;
93 void AddCharSlow(uc32 c);
94 };
95
96
97 // UTF16 buffer to read characters from a character stream. 38 // UTF16 buffer to read characters from a character stream.
98 class CharacterStreamUTF16Buffer: public UTF16Buffer { 39 class CharacterStreamUTF16Buffer: public UTF16Buffer {
99 public: 40 public:
100 CharacterStreamUTF16Buffer(); 41 CharacterStreamUTF16Buffer();
101 virtual ~CharacterStreamUTF16Buffer() {} 42 virtual ~CharacterStreamUTF16Buffer() {}
102 void Initialize(Handle<String> data, 43 void Initialize(Handle<String> data,
103 unibrow::CharacterStream* stream, 44 unibrow::CharacterStream* stream,
104 int start_position, 45 int start_position,
105 int end_position); 46 int end_position);
106 virtual void PushBack(uc32 ch); 47 virtual void PushBack(uc32 ch);
(...skipping 20 matching lines...) Expand all
127 int end_position); 68 int end_position);
128 virtual void PushBack(uc32 ch); 69 virtual void PushBack(uc32 ch);
129 virtual uc32 Advance(); 70 virtual uc32 Advance();
130 virtual void SeekForward(int pos); 71 virtual void SeekForward(int pos);
131 72
132 private: 73 private:
133 const CharType* raw_data_; // Pointer to the actual array of characters. 74 const CharType* raw_data_; // Pointer to the actual array of characters.
134 }; 75 };
135 76
136 77
137 enum ParserLanguage { JAVASCRIPT, JSON }; 78 // Initializes a UTF16Buffer as input stream, using one of a number
79 // of strategies depending on the available character sources.
80 class StreamInitializer {
81 public:
82 UTF16Buffer* Init(Handle<String> source,
83 unibrow::CharacterStream* stream,
84 int start_position,
85 int end_position);
86 private:
87 // Different UTF16 buffers used to pull characters from. Based on input one of
88 // these will be initialized as the actual data source.
89 CharacterStreamUTF16Buffer char_stream_buffer_;
90 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
91 two_byte_string_buffer_;
92 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
93
94 // Used to convert the source string into a character stream when a stream
95 // is not passed to the scanner.
96 SafeStringInputBuffer safe_string_input_buffer_;
97 };
98
99 // ----------------------------------------------------------------------------
100 // V8JavaScriptScanner
101 // JavaScript scanner getting its input from either a V8 String or a unicode
102 // CharacterStream.
103
104 class V8JavaScriptScanner : public JavaScriptScanner {
105 public:
106 V8JavaScriptScanner() {}
107
108 Token::Value NextCheckStack();
109
110 // Initialize the Scanner to scan source.
111 void Initialize(Handle<String> source);
112 void Initialize(Handle<String> source,
113 unibrow::CharacterStream* stream);
114 void Initialize(Handle<String> source,
115 int start_position, int end_position);
116
117 protected:
118 StreamInitializer stream_initializer_;
119 };
138 120
139 121
140 class Scanner { 122 class JsonScanner : public Scanner {
141 public: 123 public:
142 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 124 JsonScanner();
143
144 class LiteralScope {
145 public:
146 explicit LiteralScope(Scanner* self);
147 ~LiteralScope();
148 void Complete();
149
150 private:
151 Scanner* scanner_;
152 bool complete_;
153 };
154
155 Scanner();
156 125
157 // Initialize the Scanner to scan source. 126 // Initialize the Scanner to scan source.
158 void Initialize(Handle<String> source, 127 void Initialize(Handle<String> source);
159 ParserLanguage language);
160 void Initialize(Handle<String> source,
161 unibrow::CharacterStream* stream,
162 ParserLanguage language);
163 void Initialize(Handle<String> source,
164 int start_position, int end_position,
165 ParserLanguage language);
166 128
167 // Returns the next token. 129 // Returns the next token.
168 Token::Value Next(); 130 Token::Value Next();
169 131
170 // Returns the current token again. 132 protected:
171 Token::Value current_token() { return current_.token; } 133 // Skip past JSON whitespace (only space, tab, newline and carriage-return).
172
173 // One token look-ahead (past the token returned by Next()).
174 Token::Value peek() const { return next_.token; }
175
176 // Returns true if there was a line terminator before the peek'ed token.
177 bool has_line_terminator_before_next() const {
178 return has_line_terminator_before_next_;
179 }
180
181 struct Location {
182 Location(int b, int e) : beg_pos(b), end_pos(e) { }
183 Location() : beg_pos(0), end_pos(0) { }
184 int beg_pos;
185 int end_pos;
186 };
187
188 // Returns the location information for the current token
189 // (the token returned by Next()).
190 Location location() const { return current_.location; }
191 Location peek_location() const { return next_.location; }
192
193 // Returns the literal string, if any, for the current token (the
194 // token returned by Next()). The string is 0-terminated and in
195 // UTF-8 format; they may contain 0-characters. Literal strings are
196 // collected for identifiers, strings, and numbers.
197 // These functions only give the correct result if the literal
198 // was scanned between calls to StartLiteral() and TerminateLiteral().
199 const char* literal_string() const {
200 return current_.literal_chars.start();
201 }
202
203 int literal_length() const {
204 // Excluding terminal '\x00' added by TerminateLiteral().
205 return current_.literal_chars.length() - 1;
206 }
207
208 Vector<const char> literal() const {
209 return Vector<const char>(literal_string(), literal_length());
210 }
211
212 // Returns the literal string for the next token (the token that
213 // would be returned if Next() were called).
214 const char* next_literal_string() const {
215 return next_.literal_chars.start();
216 }
217
218
219 // Returns the length of the next token (that would be returned if
220 // Next() were called).
221 int next_literal_length() const {
222 // Excluding terminal '\x00' added by TerminateLiteral().
223 return next_.literal_chars.length() - 1;
224 }
225
226 Vector<const char> next_literal() const {
227 return Vector<const char>(next_literal_string(), next_literal_length());
228 }
229
230 // Scans the input as a regular expression pattern, previous
231 // character(s) must be /(=). Returns true if a pattern is scanned.
232 bool ScanRegExpPattern(bool seen_equal);
233 // Returns true if regexp flags are scanned (always since flags can
234 // be empty).
235 bool ScanRegExpFlags();
236
237 // Seek forward to the given position. This operation does not
238 // work in general, for instance when there are pushed back
239 // characters, but works for seeking forward until simple delimiter
240 // tokens, which is what it is used for.
241 void SeekForward(int pos);
242
243 bool stack_overflow() { return stack_overflow_; }
244
245 // Tells whether the buffer contains an identifier (no escapes).
246 // Used for checking if a property name is an identifier.
247 static bool IsIdentifier(unibrow::CharacterStream* buffer);
248
249 static const int kCharacterLookaheadBufferSize = 1;
250 static const int kNoEndPosition = 1;
251
252 private:
253 // The current and look-ahead token.
254 struct TokenDesc {
255 Token::Value token;
256 Location location;
257 Vector<const char> literal_chars;
258 };
259
260 void Init(Handle<String> source,
261 unibrow::CharacterStream* stream,
262 int start_position, int end_position,
263 ParserLanguage language);
264
265 // Literal buffer support
266 inline void StartLiteral();
267 inline void AddLiteralChar(uc32 ch);
268 inline void AddLiteralCharAdvance();
269 inline void TerminateLiteral();
270 // Stops scanning of a literal, e.g., due to an encountered error.
271 inline void DropLiteral();
272
273 // Low-level scanning support.
274 void Advance() { c0_ = source_->Advance(); }
275 void PushBack(uc32 ch) {
276 source_->PushBack(ch);
277 c0_ = ch;
278 }
279
280 bool SkipWhiteSpace() {
281 if (is_parsing_json_) {
282 return SkipJsonWhiteSpace();
283 } else {
284 return SkipJavaScriptWhiteSpace();
285 }
286 }
287
288 bool SkipJavaScriptWhiteSpace();
289 bool SkipJsonWhiteSpace(); 134 bool SkipJsonWhiteSpace();
290 Token::Value SkipSingleLineComment();
291 Token::Value SkipMultiLineComment();
292
293 inline Token::Value Select(Token::Value tok);
294 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
295
296 inline void Scan() {
297 if (is_parsing_json_) {
298 ScanJson();
299 } else {
300 ScanJavaScript();
301 }
302 }
303
304 // Scans a single JavaScript token.
305 void ScanJavaScript();
306 135
307 // Scan a single JSON token. The JSON lexical grammar is specified in the 136 // Scan a single JSON token. The JSON lexical grammar is specified in the
308 // ECMAScript 5 standard, section 15.12.1.1. 137 // ECMAScript 5 standard, section 15.12.1.1.
309 // Recognizes all of the single-character tokens directly, or calls a function 138 // Recognizes all of the single-character tokens directly, or calls a function
310 // to scan a number, string or identifier literal. 139 // to scan a number, string or identifier literal.
311 // The only allowed whitespace characters between tokens are tab, 140 // The only allowed whitespace characters between tokens are tab,
312 // carriage-return, newline and space. 141 // carriage-return, newline and space.
313 void ScanJson(); 142 void ScanJson();
314 143
315 // A JSON number (production JSONNumber) is a subset of the valid JavaScript 144 // A JSON number (production JSONNumber) is a subset of the valid JavaScript
316 // decimal number literals. 145 // decimal number literals.
317 // It includes an optional minus sign, must have at least one 146 // It includes an optional minus sign, must have at least one
318 // digit before and after a decimal point, may not have prefixed zeros (unless 147 // digit before and after a decimal point, may not have prefixed zeros (unless
319 // the integer part is zero), and may include an exponent part (e.g., "e-10"). 148 // the integer part is zero), and may include an exponent part (e.g., "e-10").
320 // Hexadecimal and octal numbers are not allowed. 149 // Hexadecimal and octal numbers are not allowed.
321 Token::Value ScanJsonNumber(); 150 Token::Value ScanJsonNumber();
322 151
323 // A JSON string (production JSONString) is a subset of valid JavaScript string 152 // A JSON string (production JSONString) is a subset of valid JavaScript string
324 // literals. The string must only be double-quoted (not single-quoted), and 153 // literals. The string must only be double-quoted (not single-quoted), and
325 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and 154 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
326 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. 155 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
327 Token::Value ScanJsonString(); 156 Token::Value ScanJsonString();
328 157
329 // Used to recognize one of the literals "true", "false", or "null". These 158 // Used to recognize one of the literals "true", "false", or "null". These
330 // are the only valid JSON identifiers (productions JSONBooleanLiteral, 159 // are the only valid JSON identifiers (productions JSONBooleanLiteral,
331 // JSONNullLiteral). 160 // JSONNullLiteral).
332 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); 161 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
333 162
334 void ScanDecimalDigits(); 163 StreamInitializer stream_initializer_;
335 Token::Value ScanNumber(bool seen_period);
336 Token::Value ScanIdentifier();
337 uc32 ScanHexEscape(uc32 c, int length);
338 uc32 ScanOctalEscape(uc32 c, int length);
339 void ScanEscape();
340 Token::Value ScanString();
341
342 // Scans a possible HTML comment -- begins with '<!'.
343 Token::Value ScanHtmlComment();
344
345 // Return the current source position.
346 int source_pos() {
347 return source_->pos() - kCharacterLookaheadBufferSize;
348 }
349
350 // Decodes a unicode escape-sequence which is part of an identifier.
351 // If the escape sequence cannot be decoded the result is kBadRune.
352 uc32 ScanIdentifierUnicodeEscape();
353
354 TokenDesc current_; // desc for current token (as returned by Next())
355 TokenDesc next_; // desc for next token (one token look-ahead)
356 bool has_line_terminator_before_next_;
357 bool is_parsing_json_;
358
359 // Different UTF16 buffers used to pull characters from. Based on input one of
360 // these will be initialized as the actual data source.
361 CharacterStreamUTF16Buffer char_stream_buffer_;
362 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
363 two_byte_string_buffer_;
364 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
365
366 // Source. Will point to one of the buffers declared above.
367 UTF16Buffer* source_;
368
369 // Used to convert the source string into a character stream when a stream
370 // is not passed to the scanner.
371 SafeStringInputBuffer safe_string_input_buffer_;
372
373 // Buffer to hold literal values (identifiers, strings, numbers)
374 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
375 UTF8Buffer literal_buffer_;
376
377 bool stack_overflow_;
378
379 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
380 uc32 c0_;
381 }; 164 };
382 165
383 166
384 // ExternalStringUTF16Buffer 167 // ExternalStringUTF16Buffer
385 template <typename StringType, typename CharType> 168 template <typename StringType, typename CharType>
386 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() 169 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()
387 : raw_data_(NULL) { } 170 : raw_data_(NULL) { }
388 171
389 172
390 template <typename StringType, typename CharType> 173 template <typename StringType, typename CharType>
391 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( 174 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
392 Handle<StringType> data, 175 Handle<StringType> data,
393 int start_position, 176 int start_position,
394 int end_position) { 177 int end_position) {
395 ASSERT(!data.is_null()); 178 ASSERT(!data.is_null());
396 raw_data_ = data->resource()->data(); 179 raw_data_ = data->resource()->data();
397 180
398 ASSERT(end_position <= data->length()); 181 ASSERT(end_position <= data->length());
399 if (start_position > 0) { 182 if (start_position > 0) {
400 SeekForward(start_position); 183 SeekForward(start_position);
401 } 184 }
402 end_ = 185 end_ =
403 end_position != Scanner::kNoEndPosition ? end_position : data->length(); 186 end_position != kNoEndPosition ? end_position : data->length();
404 } 187 }
405 188
406 189
407 template <typename StringType, typename CharType> 190 template <typename StringType, typename CharType>
408 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { 191 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {
409 if (pos_ < end_) { 192 if (pos_ < end_) {
410 return raw_data_[pos_++]; 193 return raw_data_[pos_++];
411 } else { 194 } else {
412 // note: currently the following increment is necessary to avoid a 195 // note: currently the following increment is necessary to avoid a
413 // test-parser problem! 196 // test-parser problem!
(...skipping 12 matching lines...) Expand all
426 209
427 210
428 template <typename StringType, typename CharType> 211 template <typename StringType, typename CharType>
429 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { 212 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {
430 pos_ = pos; 213 pos_ = pos;
431 } 214 }
432 215
433 } } // namespace v8::internal 216 } } // namespace v8::internal
434 217
435 #endif // V8_SCANNER_H_ 218 #endif // V8_SCANNER_H_
OLDNEW
« no previous file with comments | « src/prescanner.h ('k') | src/scanner.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698