src/scanner.h - Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base.

Side by Side Diff: src/scanner.h

Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base. (Closed)

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 17 matching lines...) Expand all Loading...
28 #ifndef V8_SCANNER_H_	28 #ifndef V8_SCANNER_H_

29 #define V8_SCANNER_H_	29 #define V8_SCANNER_H_

30	30

31 #include "token.h"	31 #include "token.h"

32 #include "char-predicates-inl.h"	32 #include "char-predicates-inl.h"

33 #include "scanner-base.h"	33 #include "scanner-base.h"

34	34

35 namespace v8 {	35 namespace v8 {

36 namespace internal {	36 namespace internal {

37	37

38

39 class UTF8Buffer {

40 public:

41 UTF8Buffer();

42 ~UTF8Buffer();

43

44 inline void AddChar(uc32 c) {

45 if (recording_) {

46 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {

47 buffer_.Add(static_cast<char>(c));

48 } else {

49 AddCharSlow(c);

50 }

51 }

52 }

53

54 void StartLiteral() {

55 buffer_.StartSequence();

56 recording_ = true;

57 }

58

59 Vector<const char> EndLiteral() {

60 if (recording_) {

61 recording_ = false;

62 buffer_.Add(kEndMarker);

63 Vector<char> sequence = buffer_.EndSequence();

64 return Vector<const char>(sequence.start(), sequence.length());

65 }

66 return Vector<const char>();

67 }

68

69 void DropLiteral() {

70 if (recording_) {

71 recording_ = false;

72 buffer_.DropSequence();

73 }

74 }

75

76 void Reset() {

77 buffer_.Reset();

78 }

79

80 // The end marker added after a parsed literal.

81 // Using zero allows the usage of strlen and similar functions on

82 // identifiers and numbers (but not strings, since they may contain zero

83 // bytes).

84 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside

85 // a UTF-8 string. This requires changes in all places that use

86 // str-functions on the literals, but allows a single pointer to represent

87 // the literal, even if it contains embedded zeros.

88 static const char kEndMarker = '\x00';

89 private:

90 static const int kInitialCapacity = 256;

91 SequenceCollector<char, 4> buffer_;

92 bool recording_;

93 void AddCharSlow(uc32 c);

94 };

95

96

97 // UTF16 buffer to read characters from a character stream.	38 // UTF16 buffer to read characters from a character stream.

98 class CharacterStreamUTF16Buffer: public UTF16Buffer {	39 class CharacterStreamUTF16Buffer: public UTF16Buffer {

99 public:	40 public:

100 CharacterStreamUTF16Buffer();	41 CharacterStreamUTF16Buffer();

101 virtual ~CharacterStreamUTF16Buffer() {}	42 virtual ~CharacterStreamUTF16Buffer() {}

102 void Initialize(Handle<String> data,	43 void Initialize(Handle<String> data,

103 unibrow::CharacterStream* stream,	44 unibrow::CharacterStream* stream,

104 int start_position,	45 int start_position,

105 int end_position);	46 int end_position);

106 virtual void PushBack(uc32 ch);	47 virtual void PushBack(uc32 ch);

(...skipping 20 matching lines...) Expand all Loading...
127 int end_position);	68 int end_position);

128 virtual void PushBack(uc32 ch);	69 virtual void PushBack(uc32 ch);

129 virtual uc32 Advance();	70 virtual uc32 Advance();

130 virtual void SeekForward(int pos);	71 virtual void SeekForward(int pos);

131	72

132 private:	73 private:

133 const CharType* raw_data_; // Pointer to the actual array of characters.	74 const CharType* raw_data_; // Pointer to the actual array of characters.

134 };	75 };

135	76

136	77

137 enum ParserLanguage { JAVASCRIPT, JSON };	78 // Initializes a UTF16Buffer as input stream, using one of a number

	79 // of strategies depending on the available character sources.

	80 class StreamInitializer {

	81 public:

	82 UTF16Buffer* Init(Handle<String> source,

	83 unibrow::CharacterStream* stream,

	84 int start_position,

	85 int end_position);

	86 private:

	87 // Different UTF16 buffers used to pull characters from. Based on input one of

	88 // these will be initialized as the actual data source.

	89 CharacterStreamUTF16Buffer char_stream_buffer_;

	90 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>

	91 two_byte_string_buffer_;

	92 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;

	93

	94 // Used to convert the source string into a character stream when a stream

	95 // is not passed to the scanner.

	96 SafeStringInputBuffer safe_string_input_buffer_;

	97 };

	98

	99 // ----------------------------------------------------------------------------

	100 // V8JavaScriptScanner

	101 // JavaScript scanner getting its input from either a V8 String or a unicode

	102 // CharacterStream.

	103

	104 class V8JavaScriptScanner : public JavaScriptScanner {

	105 public:

	106 V8JavaScriptScanner() {}

	107

	108 Token::Value NextCheckStack();

	109

	110 // Initialize the Scanner to scan source.

	111 void Initialize(Handle<String> source);

	112 void Initialize(Handle<String> source,

	113 unibrow::CharacterStream* stream);

	114 void Initialize(Handle<String> source,

	115 int start_position, int end_position);

	116

	117 protected:

	118 StreamInitializer stream_initializer_;

	119 };

138	120

139	121

140 class Scanner {	122 class JsonScanner : public Scanner {

141 public:	123 public:

142 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;	124 JsonScanner();

143

144 class LiteralScope {

145 public:

146 explicit LiteralScope(Scanner* self);

147 ~LiteralScope();

148 void Complete();

149

150 private:

151 Scanner* scanner_;

152 bool complete_;

153 };

154

155 Scanner();

156	125

157 // Initialize the Scanner to scan source.	126 // Initialize the Scanner to scan source.

158 void Initialize(Handle<String> source,	127 void Initialize(Handle<String> source);

159 ParserLanguage language);

160 void Initialize(Handle<String> source,

161 unibrow::CharacterStream* stream,

162 ParserLanguage language);

163 void Initialize(Handle<String> source,

164 int start_position, int end_position,

165 ParserLanguage language);

166	128

167 // Returns the next token.	129 // Returns the next token.

168 Token::Value Next();	130 Token::Value Next();

169	131

170 // Returns the current token again.	132 protected:

171 Token::Value current_token() { return current_.token; }	133 // Skip past JSON whitespace (only space, tab, newline and carriage-return).

172

173 // One token look-ahead (past the token returned by Next()).

174 Token::Value peek() const { return next_.token; }

175

176 // Returns true if there was a line terminator before the peek'ed token.

177 bool has_line_terminator_before_next() const {

178 return has_line_terminator_before_next_;

179 }

180

181 struct Location {

182 Location(int b, int e) : beg_pos(b), end_pos(e) { }

183 Location() : beg_pos(0), end_pos(0) { }

184 int beg_pos;

185 int end_pos;

186 };

187

188 // Returns the location information for the current token

189 // (the token returned by Next()).

190 Location location() const { return current_.location; }

191 Location peek_location() const { return next_.location; }

192

193 // Returns the literal string, if any, for the current token (the

194 // token returned by Next()). The string is 0-terminated and in

195 // UTF-8 format; it may contain 0-characters. Literal strings are

196 // collected for identifiers, strings, and numbers.

197 // These functions only give the correct result if the literal

198 // was scanned between calls to StartLiteral() and TerminateLiteral().

199 const char* literal_string() const {

200 return current_.literal_chars.start();

201 }

202

203 int literal_length() const {

204 // Excluding terminal '\x00' added by TerminateLiteral().

205 return current_.literal_chars.length() - 1;

206 }

207

208 Vector<const char> literal() const {

209 return Vector<const char>(literal_string(), literal_length());

210 }

211

212 // Returns the literal string for the next token (the token that

213 // would be returned if Next() were called).

214 const char* next_literal_string() const {

215 return next_.literal_chars.start();

216 }

217

218

219 // Returns the length of the next token (that would be returned if

220 // Next() were called).

221 int next_literal_length() const {

222 // Excluding terminal '\x00' added by TerminateLiteral().

223 return next_.literal_chars.length() - 1;

224 }

225

226 Vector<const char> next_literal() const {

227 return Vector<const char>(next_literal_string(), next_literal_length());

228 }

229

230 // Scans the input as a regular expression pattern, previous

231 // character(s) must be /(=). Returns true if a pattern is scanned.

232 bool ScanRegExpPattern(bool seen_equal);

233 // Returns true if regexp flags are scanned (always since flags can

234 // be empty).

235 bool ScanRegExpFlags();

236

237 // Seek forward to the given position. This operation does not

238 // work in general, for instance when there are pushed back

239 // characters, but works for seeking forward until simple delimiter

240 // tokens, which is what it is used for.

241 void SeekForward(int pos);

242

243 bool stack_overflow() { return stack_overflow_; }

244

245 // Tells whether the buffer contains an identifier (no escapes).

246 // Used for checking if a property name is an identifier.

247 static bool IsIdentifier(unibrow::CharacterStream* buffer);

248

249 static const int kCharacterLookaheadBufferSize = 1;

250 static const int kNoEndPosition = 1;

251

252 private:

253 // The current and look-ahead token.

254 struct TokenDesc {

255 Token::Value token;

256 Location location;

257 Vector<const char> literal_chars;

258 };

259

260 void Init(Handle<String> source,

261 unibrow::CharacterStream* stream,

262 int start_position, int end_position,

263 ParserLanguage language);

264

265 // Literal buffer support

266 inline void StartLiteral();

267 inline void AddLiteralChar(uc32 ch);

268 inline void AddLiteralCharAdvance();

269 inline void TerminateLiteral();

270 // Stops scanning of a literal, e.g., due to an encountered error.

271 inline void DropLiteral();

272

273 // Low-level scanning support.

274 void Advance() { c0_ = source_->Advance(); }

275 void PushBack(uc32 ch) {

276 source_->PushBack(ch);

277 c0_ = ch;

278 }

279

280 bool SkipWhiteSpace() {

281 if (is_parsing_json_) {

282 return SkipJsonWhiteSpace();

283 } else {

284 return SkipJavaScriptWhiteSpace();

285 }

286 }

287

288 bool SkipJavaScriptWhiteSpace();

289 bool SkipJsonWhiteSpace();	134 bool SkipJsonWhiteSpace();

290 Token::Value SkipSingleLineComment();

291 Token::Value SkipMultiLineComment();

292

293 inline Token::Value Select(Token::Value tok);

294 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);

295

296 inline void Scan() {

297 if (is_parsing_json_) {

298 ScanJson();

299 } else {

300 ScanJavaScript();

301 }

302 }

303

304 // Scans a single JavaScript token.

305 void ScanJavaScript();

306	135

307 // Scan a single JSON token. The JSON lexical grammar is specified in the	136 // Scan a single JSON token. The JSON lexical grammar is specified in the

308 // ECMAScript 5 standard, section 15.12.1.1.	137 // ECMAScript 5 standard, section 15.12.1.1.

309 // Recognizes all of the single-character tokens directly, or calls a function	138 // Recognizes all of the single-character tokens directly, or calls a function

310 // to scan a number, string or identifier literal.	139 // to scan a number, string or identifier literal.

311 // The only allowed whitespace characters between tokens are tab,	140 // The only allowed whitespace characters between tokens are tab,

312 // carriage-return, newline and space.	141 // carriage-return, newline and space.

313 void ScanJson();	142 void ScanJson();

314	143

315 // A JSON number (production JSONNumber) is a subset of the valid JavaScript	144 // A JSON number (production JSONNumber) is a subset of the valid JavaScript

316 // decimal number literals.	145 // decimal number literals.

317 // It includes an optional minus sign, must have at least one	146 // It includes an optional minus sign, must have at least one

318 // digit before and after a decimal point, may not have prefixed zeros (unless	147 // digit before and after a decimal point, may not have prefixed zeros (unless

319 // the integer part is zero), and may include an exponent part (e.g., "e-10").	148 // the integer part is zero), and may include an exponent part (e.g., "e-10").

320 // Hexadecimal and octal numbers are not allowed.	149 // Hexadecimal and octal numbers are not allowed.

321 Token::Value ScanJsonNumber();	150 Token::Value ScanJsonNumber();

322	151

323 // A JSON string (production JSONString) is a subset of valid JavaScript string	152 // A JSON string (production JSONString) is a subset of valid JavaScript string

324 // literals. The string must only be double-quoted (not single-quoted), and	153 // literals. The string must only be double-quoted (not single-quoted), and

325 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and	154 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and

326 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.	155 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.

327 Token::Value ScanJsonString();	156 Token::Value ScanJsonString();

328	157

329 // Used to recognize one of the literals "true", "false", or "null". These	158 // Used to recognize one of the literals "true", "false", or "null". These

330 // are the only valid JSON identifiers (productions JSONBooleanLiteral,	159 // are the only valid JSON identifiers (productions JSONBooleanLiteral,

331 // JSONNullLiteral).	160 // JSONNullLiteral).

332 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);	161 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);

333	162

334 void ScanDecimalDigits();	163 StreamInitializer stream_initializer_;

335 Token::Value ScanNumber(bool seen_period);

336 Token::Value ScanIdentifier();

337 uc32 ScanHexEscape(uc32 c, int length);

338 uc32 ScanOctalEscape(uc32 c, int length);

339 void ScanEscape();

340 Token::Value ScanString();

341

342 // Scans a possible HTML comment -- begins with '<!'.

343 Token::Value ScanHtmlComment();

344

345 // Return the current source position.

346 int source_pos() {

347 return source_->pos() - kCharacterLookaheadBufferSize;

348 }

349

350 // Decodes a unicode escape-sequence which is part of an identifier.

351 // If the escape sequence cannot be decoded the result is kBadRune.

352 uc32 ScanIdentifierUnicodeEscape();

353

354 TokenDesc current_; // desc for current token (as returned by Next())

355 TokenDesc next_; // desc for next token (one token look-ahead)

356 bool has_line_terminator_before_next_;

357 bool is_parsing_json_;

358

359 // Different UTF16 buffers used to pull characters from. Based on input one of

360 // these will be initialized as the actual data source.

361 CharacterStreamUTF16Buffer char_stream_buffer_;

362 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>

363 two_byte_string_buffer_;

364 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;

365

366 // Source. Will point to one of the buffers declared above.

367 UTF16Buffer* source_;

368

369 // Used to convert the source string into a character stream when a stream

370 // is not passed to the scanner.

371 SafeStringInputBuffer safe_string_input_buffer_;

372

373 // Buffer to hold literal values (identifiers, strings, numbers)

374 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.

375 UTF8Buffer literal_buffer_;

376

377 bool stack_overflow_;

378

379 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

380 uc32 c0_;

381 };	164 };

382	165

383	166

384 // ExternalStringUTF16Buffer	167 // ExternalStringUTF16Buffer

385 template <typename StringType, typename CharType>	168 template <typename StringType, typename CharType>

386 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()	169 ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()

387 : raw_data_(NULL) { }	170 : raw_data_(NULL) { }

388	171

389	172

390 template <typename StringType, typename CharType>	173 template <typename StringType, typename CharType>

391 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(	174 void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(

392 Handle<StringType> data,	175 Handle<StringType> data,

393 int start_position,	176 int start_position,

394 int end_position) {	177 int end_position) {

395 ASSERT(!data.is_null());	178 ASSERT(!data.is_null());

396 raw_data_ = data->resource()->data();	179 raw_data_ = data->resource()->data();

397	180

398 ASSERT(end_position <= data->length());	181 ASSERT(end_position <= data->length());

399 if (start_position > 0) {	182 if (start_position > 0) {

400 SeekForward(start_position);	183 SeekForward(start_position);

401 }	184 }

402 end_ =	185 end_ =

403 end_position != Scanner::kNoEndPosition ? end_position : data->length();	186 end_position != kNoEndPosition ? end_position : data->length();

404 }	187 }

405	188

406	189

407 template <typename StringType, typename CharType>	190 template <typename StringType, typename CharType>

408 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {	191 uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {

409 if (pos_ < end_) {	192 if (pos_ < end_) {

410 return raw_data_[pos_++];	193 return raw_data_[pos_++];

411 } else {	194 } else {

412 // note: currently the following increment is necessary to avoid a	195 // note: currently the following increment is necessary to avoid a

413 // test-parser problem!	196 // test-parser problem!

(...skipping 12 matching lines...) Expand all Loading...
426	209

427	210

428 template <typename StringType, typename CharType>	211 template <typename StringType, typename CharType>

429 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {	212 void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {

430 pos_ = pos;	213 pos_ = pos;

431 }	214 }

432	215

433 } } // namespace v8::internal	216 } } // namespace v8::internal

434	217

435 #endif // V8_SCANNER_H_	218 #endif // V8_SCANNER_H_

OLD	NEW

« no previous file with comments | « src/prescanner.h ('k') | src/scanner.cc » ('j') | no next file with comments »