src/scanner-base.h - Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base.

Side by Side Diff: src/scanner-base.h

Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base. (Closed)

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 19 matching lines...) Expand all Loading...
30 #ifndef V8_SCANNER_BASE_H_	30 #ifndef V8_SCANNER_BASE_H_

31 #define V8_SCANNER_BASE_H_	31 #define V8_SCANNER_BASE_H_

32	32

33 #include "globals.h"	33 #include "globals.h"

34 #include "checks.h"	34 #include "checks.h"

35 #include "allocation.h"	35 #include "allocation.h"

36 #include "token.h"	36 #include "token.h"

37 #include "unicode-inl.h"	37 #include "unicode-inl.h"

38 #include "char-predicates.h"	38 #include "char-predicates.h"

39 #include "utils.h"	39 #include "utils.h"

	40 #include "list-inl.h"

40	41

41 namespace v8 {	42 namespace v8 {

42 namespace internal {	43 namespace internal {

43	44

44 // Interface through which the scanner reads characters from the input source.	45 // Returns the value (0 .. 15) of a hexadecimal character c.

	46 // If c is not a legal hexadecimal character, returns a value < 0.

	47 inline int HexValue(uc32 c) {

	48 c -= '0';

	49 if (static_cast<unsigned>(c) <= 9) return c;

	50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

	51 if (static_cast<unsigned>(c) <= 6) return c + 10;

	52 return -1;

	53 }

	54

	55 // ----------------------------------------------------------------------------

	56 // UTF16Buffer - scanner input source with pushback.

	57

45 class UTF16Buffer {	58 class UTF16Buffer {

46 public:	59 public:

47 UTF16Buffer();	60 UTF16Buffer();

48 virtual ~UTF16Buffer() {}	61 virtual ~UTF16Buffer() {}

49	62

50 virtual void PushBack(uc32 ch) = 0;	63 virtual void PushBack(uc32 ch) = 0;

51 // Returns a value < 0 when the buffer end is reached.	64 // Returns a value < 0 when the buffer end is reached.

52 virtual uc32 Advance() = 0;	65 virtual uc32 Advance() = 0;

53 virtual void SeekForward(int pos) = 0;	66 virtual void SeekForward(int pos) = 0;

54	67

55 int pos() const { return pos_; }	68 int pos() const { return pos_; }

56	69

	70 static const int kNoEndPosition = 1;

	71

57 protected:	72 protected:

	73 // Initial value of end_ before the input stream is initialized.

	74

58 int pos_; // Current position in the buffer.	75 int pos_; // Current position in the buffer.

59 int end_; // Position where scanning should stop (EOF).	76 int end_; // Position where scanning should stop (EOF).

60 };	77 };

61	78

62	79

63 class ScannerConstants : AllStatic {	80 class ScannerConstants : AllStatic {

64 public:	81 public:

65 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;	82 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

66	83

67 static StaticResource<Utf8Decoder>* utf8_decoder() {	84 static StaticResource<Utf8Decoder>* utf8_decoder() {

68 return &utf8_decoder_;	85 return &utf8_decoder_;

69 }	86 }

70	87

71 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;	88 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;

72 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;	89 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;

73 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;	90 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;

74 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;	91 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;

75	92

76 static bool IsIdentifier(unibrow::CharacterStream* buffer);	93 static bool IsIdentifier(unibrow::CharacterStream* buffer);

77	94

78 private:	95 private:

79 static StaticResource<Utf8Decoder> utf8_decoder_;	96 static StaticResource<Utf8Decoder> utf8_decoder_;

80 };	97 };

81	98

	99 // ----------------------------------------------------------------------------

	100 // LiteralCollector - Collector of chars of literals.

	101

	102 class LiteralCollector {

	103 public:

	104 LiteralCollector();

	105 ~LiteralCollector();

	106

	107 inline void AddChar(uc32 c) {

	108 if (recording_) {

	109 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {

	110 buffer_.Add(static_cast<char>(c));

	111 } else {

	112 AddCharSlow(c);

	113 }

	114 }

	115 }

	116

	117 void StartLiteral() {

	118 buffer_.StartSequence();

	119 recording_ = true;

	120 }

	121

	122 Vector<const char> EndLiteral() {

	123 if (recording_) {

	124 recording_ = false;

	125 buffer_.Add(kEndMarker);

	126 Vector<char> sequence = buffer_.EndSequence();

	127 return Vector<const char>(sequence.start(), sequence.length());

	128 }

	129 return Vector<const char>();

	130 }

	131

	132 void DropLiteral() {

	133 if (recording_) {

	134 recording_ = false;

	135 buffer_.DropSequence();

	136 }

	137 }

	138

	139 void Reset() {

	140 buffer_.Reset();

	141 }

	142

	143 // The end marker added after a parsed literal.

	144 // Using zero allows the usage of strlen and similar functions on

	145 // identifiers and numbers (but not strings, since they may contain zero

	146 // bytes).

	147 static const char kEndMarker = '\x00';

	148 private:

	149 static const int kInitialCapacity = 256;

	150 SequenceCollector<char, 4> buffer_;

	151 bool recording_;

	152 void AddCharSlow(uc32 c);

	153 };

	154

	155 // ----------------------------------------------------------------------------

	156 // Scanner base-class.

	157

	158 // Generic functionality used by both JSON and JavaScript scanners.

	159 class Scanner {

	160 public:

	161 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

	162

	163 class LiteralScope {

	164 public:

	165 explicit LiteralScope(Scanner* self);

	166 ~LiteralScope();

	167 void Complete();

	168

	169 private:

	170 Scanner* scanner_;

	171 bool complete_;

	172 };

	173

	174 Scanner();

	175

	176 // Returns the current token again.

	177 Token::Value current_token() { return current_.token; }

	178

	179 // One token look-ahead (past the token returned by Next()).

	180 Token::Value peek() const { return next_.token; }

	181

	182 struct Location {

	183 Location(int b, int e) : beg_pos(b), end_pos(e) { }

	184 Location() : beg_pos(0), end_pos(0) { }

	185 int beg_pos;

	186 int end_pos;

	187 };

	188

	189 // Returns the location information for the current token

	190 // (the token returned by Next()).

	191 Location location() const { return current_.location; }

	192 Location peek_location() const { return next_.location; }

	193

	194 // Returns the literal string, if any, for the current token (the

	195 // token returned by Next()). The string is 0-terminated and in

	196 // UTF-8 format; it may contain 0-characters. Literal strings are

	197 // collected for identifiers, strings, and numbers.

	198 // These functions only give the correct result if the literal

	199 // was scanned between calls to StartLiteral() and TerminateLiteral().

	200 const char* literal_string() const {

	201 return current_.literal_chars.start();

	202 }

	203

	204 int literal_length() const {

	205 // Excluding terminal '\x00' added by TerminateLiteral().

	206 return current_.literal_chars.length() - 1;

	207 }

	208

	209 Vector<const char> literal() const {

	210 return Vector<const char>(literal_string(), literal_length());

	211 }

	212

	213 // Returns the literal string for the next token (the token that

	214 // would be returned if Next() were called).

	215 const char* next_literal_string() const {

	216 return next_.literal_chars.start();

	217 }

	218

	219

	220 // Returns the length of the next token (that would be returned if

	221 // Next() were called).

	222 int next_literal_length() const {

	223 // Excluding terminal '\x00' added by TerminateLiteral().

	224 return next_.literal_chars.length() - 1;

	225 }

	226

	227 Vector<const char> next_literal() const {

	228 return Vector<const char>(next_literal_string(), next_literal_length());

	229 }

	230

	231 bool stack_overflow() { return stack_overflow_; }

	232

	233 static const int kCharacterLookaheadBufferSize = 1;

	234

	235 protected:

	236 // The current and look-ahead token.

	237 struct TokenDesc {

	238 Token::Value token;

	239 Location location;

	240 Vector<const char> literal_chars;

	241 };

	242

	243 // Call this after setting source_ to the input.

	244 void Init() {

	245 // Set c0_ (one character ahead)

	246 ASSERT(kCharacterLookaheadBufferSize == 1);

	247 Advance();

	248 // Initialize current_ to not refer to a literal.

	249 current_.literal_chars = Vector<const char>();

	250 // Reset literal buffer.

	251 literal_buffer_.Reset();

	252 }

	253

	254 // Literal buffer support

	255 inline void StartLiteral() {

	256 literal_buffer_.StartLiteral();

	257 }

	258

	259 inline void AddLiteralChar(uc32 c) {

	260 literal_buffer_.AddChar(c);

	261 }

	262

	263 // Complete scanning of a literal.

	264 inline void TerminateLiteral() {

	265 next_.literal_chars = literal_buffer_.EndLiteral();

	266 }

	267

	268 // Stops scanning of a literal and drops the collected characters,

	269 // e.g., due to an encountered error.

	270 inline void DropLiteral() {

	271 literal_buffer_.DropLiteral();

	272 }

	273

	274 inline void AddLiteralCharAdvance() {

	275 AddLiteralChar(c0_);

	276 Advance();

	277 }

	278

	279 // Low-level scanning support.

	280 void Advance() { c0_ = source_->Advance(); }

	281 void PushBack(uc32 ch) {

	282 source_->PushBack(ch);

	283 c0_ = ch;

	284 }

	285

	286 inline Token::Value Select(Token::Value tok) {

	287 Advance();

	288 return tok;

	289 }

	290

	291 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {

	292 Advance();

	293 if (c0_ == next) {

	294 Advance();

	295 return then;

	296 } else {

	297 return else_;

	298 }

	299 }

	300

	301 uc32 ScanHexEscape(uc32 c, int length);

	302 uc32 ScanOctalEscape(uc32 c, int length);

	303

	304 // Return the current source position.

	305 int source_pos() {

	306 return source_->pos() - kCharacterLookaheadBufferSize;

	307 }

	308

	309 TokenDesc current_; // desc for current token (as returned by Next())

	310 TokenDesc next_; // desc for next token (one token look-ahead)

	311

	312 // Input stream. Must be initialized to a UTF16Buffer.

	313 UTF16Buffer* source_;

	314

	315 // Buffer to hold literal values (identifiers, strings, numbers)

	316 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.

	317 LiteralCollector literal_buffer_;

	318

	319 bool stack_overflow_;

	320

	321 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

	322 uc32 c0_;

	323 };

	324

	325 // ----------------------------------------------------------------------------

	326 // JavaScriptScanner - base logic for JavaScript scanning.

	327

	328 class JavaScriptScanner : public Scanner {

	329 public:

	330 JavaScriptScanner();

	331

	332 // Returns the next token.

	333 Token::Value Next();

	334

	335 // Returns true if there was a line terminator before the peek'ed token.

	336 bool has_line_terminator_before_next() const {

	337 return has_line_terminator_before_next_;

	338 }

	339

	340 // Scans the input as a regular expression pattern, previous

	341 // character(s) must be /(=). Returns true if a pattern is scanned.

	342 bool ScanRegExpPattern(bool seen_equal);

	343 // Returns true if regexp flags are scanned (always since flags can

	344 // be empty).

	345 bool ScanRegExpFlags();

	346

	347 // Tells whether the buffer contains an identifier (no escapes).

	348 // Used for checking if a property name is an identifier.

	349 static bool IsIdentifier(unibrow::CharacterStream* buffer);

	350

	351 // Seek forward to the given position. This operation does not

	352 // work in general, for instance when there are pushed back

	353 // characters, but works for seeking forward until simple delimiter

	354 // tokens, which is what it is used for.

	355 void SeekForward(int pos);

	356

	357 protected:

	358 bool SkipWhiteSpace();

	359 Token::Value SkipSingleLineComment();

	360 Token::Value SkipMultiLineComment();

	361

	362 // Scans a single JavaScript token.

	363 void Scan();

	364

	365 void ScanDecimalDigits();

	366 Token::Value ScanNumber(bool seen_period);

	367 Token::Value ScanIdentifier();

	368

	369 void ScanEscape();

	370 Token::Value ScanString();

	371

	372 // Scans a possible HTML comment -- begins with '<!'.

	373 Token::Value ScanHtmlComment();

	374

	375 // Decodes a unicode escape-sequence which is part of an identifier.

	376 // If the escape sequence cannot be decoded the result is kBadChar.

	377 uc32 ScanIdentifierUnicodeEscape();

	378

	379 bool has_line_terminator_before_next_;

	380 };

	381

	382

	383 // ----------------------------------------------------------------------------

	384 // Keyword matching state machine.

82	385

83 class KeywordMatcher {	386 class KeywordMatcher {

84 // Incrementally recognize keywords.	387 // Incrementally recognize keywords.

85 //	388 //

86 // Recognized keywords:	389 // Recognized keywords:

87 // break case catch const* continue debugger* default delete do else	390 // break case catch const* continue debugger* default delete do else

88 // finally false for function if in instanceof native* new null	391 // finally false for function if in instanceof native* new null

89 // return switch this throw true try typeof var void while with	392 // return switch this throw true try typeof var void while with

90 //	393 //

91 // *: Actually "future reserved keywords". These are the only ones we	394 // *: Actually "future reserved keywords". These are the only ones we

(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
197 // keyword with the current prefix).	500 // keyword with the current prefix).

198 const char* keyword_;	501 const char* keyword_;

199 int counter_;	502 int counter_;

200 Token::Value keyword_token_;	503 Token::Value keyword_token_;

201 };	504 };

202	505

203	506

204 } } // namespace v8::internal	507 } } // namespace v8::internal

205	508

206 #endif // V8_SCANNER_BASE_H_	509 #endif // V8_SCANNER_BASE_H_

OLD	NEW

« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »