src/scanner-base.h - Issue 7739020: Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.*

Side by Side Diff: src/scanner-base.h

Issue 7739020: Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.* (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: rename scanner-base.* to scanner.* Created 9 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are

4 // met:

5 //

6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided

11 // with the distribution.

12 // * Neither the name of Google Inc. nor the names of its

13 // contributors may be used to endorse or promote products derived

14 // from this software without specific prior written permission.

15 //

16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27

28 // Features shared by parsing and pre-parsing scanners.

29

30 #ifndef V8_SCANNER_BASE_H_

31 #define V8_SCANNER_BASE_H_

32

33 #include "allocation.h"

34 #include "char-predicates.h"

35 #include "checks.h"

36 #include "globals.h"

37 #include "token.h"

38 #include "unicode-inl.h"

39 #include "utils.h"

40

41 namespace v8 {

42 namespace internal {

43

44 // Returns the value (0 .. 15) of a hexadecimal character c.

45 // If c is not a legal hexadecimal character, returns a value < 0.

46 inline int HexValue(uc32 c) {

47 c -= '0';

48 if (static_cast<unsigned>(c) <= 9) return c;

49 c = (c \| 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

50 if (static_cast<unsigned>(c) <= 5) return c + 10;

51 return -1;

52 }

53

54

55 // ---------------------------------------------------------------------

56 // Buffered stream of characters, using an internal UC16 buffer.

57

58 class UC16CharacterStream {

59 public:

60 UC16CharacterStream() : pos_(0) { }

61 virtual ~UC16CharacterStream() { }

62

63 // Returns and advances past the next UC16 character in the input

64 // stream. If there are no more characters, it returns a negative

65 // value.

66 inline uc32 Advance() {

67 if (buffer_cursor_ < buffer_end_ \|\| ReadBlock()) {

68 pos_++;

69 return static_cast<uc32>(*(buffer_cursor_++));

70 }

71 // Note: currently the following increment is necessary to avoid a

72 // parser problem! The scanner treats the final kEndOfInput as

73 // a character with a position, and does math relative to that

74 // position.

75 pos_++;

76

77 return kEndOfInput;

78 }

79

80 // Return the current position in the character stream.

81 // Starts at zero.

82 inline unsigned pos() const { return pos_; }

83

84 // Skips forward past the next character_count UC16 characters

85 // in the input, or until the end of input if that comes sooner.

86 // Returns the number of characters actually skipped. If less

87 // than character_count,

88 inline unsigned SeekForward(unsigned character_count) {

89 unsigned buffered_chars =

90 static_cast<unsigned>(buffer_end_ - buffer_cursor_);

91 if (character_count <= buffered_chars) {

92 buffer_cursor_ += character_count;

93 pos_ += character_count;

94 return character_count;

95 }

96 return SlowSeekForward(character_count);

97 }

98

99 // Pushes back the most recently read UC16 character (or negative

100 // value if at end of input), i.e., the value returned by the most recent

101 // call to Advance.

102 // Must not be used right after calling SeekForward.

103 virtual void PushBack(int32_t character) = 0;

104

105 protected:

106 static const uc32 kEndOfInput = -1;

107

108 // Ensures that the buffer_cursor_ points to the character at

109 // position pos_ of the input, if possible. If the position

110 // is at or after the end of the input, return false. If there

111 // are more characters available, return true.

112 virtual bool ReadBlock() = 0;

113 virtual unsigned SlowSeekForward(unsigned character_count) = 0;

114

115 const uc16* buffer_cursor_;

116 const uc16* buffer_end_;

117 unsigned pos_;

118 };

119

120

121 class UnicodeCache {

122 // ---------------------------------------------------------------------

123 // Caching predicates used by scanners.

124 public:

125 UnicodeCache() {}

126 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

127

128 StaticResource<Utf8Decoder>* utf8_decoder() {

129 return &utf8_decoder_;

130 }

131

132 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }

133 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }

134 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }

135 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }

136

137 private:

138

139 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;

140 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;

141 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;

142 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;

143 StaticResource<Utf8Decoder> utf8_decoder_;

144

145 DISALLOW_COPY_AND_ASSIGN(UnicodeCache);

146 };

147

148

149 // ----------------------------------------------------------------------------

150 // LiteralBuffer - Collector of chars of literals.

151

152 class LiteralBuffer {

153 public:

154 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }

155

156 ~LiteralBuffer() {

157 if (backing_store_.length() > 0) {

158 backing_store_.Dispose();

159 }

160 }

161

162 inline void AddChar(uc16 character) {

163 if (position_ >= backing_store_.length()) ExpandBuffer();

164 if (is_ascii_) {

165 if (character < kMaxAsciiCharCodeU) {

166 backing_store_[position_] = static_cast<byte>(character);

167 position_ += kASCIISize;

168 return;

169 }

170 ConvertToUC16();

171 }

172 reinterpret_cast<uc16>(&backing_store_[position_]) = character;

173 position_ += kUC16Size;

174 }

175

176 bool is_ascii() { return is_ascii_; }

177

178 Vector<const uc16> uc16_literal() {

179 ASSERT(!is_ascii_);

180 ASSERT((position_ & 0x1) == 0);

181 return Vector<const uc16>(

182 reinterpret_cast<const uc16*>(backing_store_.start()),

183 position_ >> 1);

184 }

185

186 Vector<const char> ascii_literal() {

187 ASSERT(is_ascii_);

188 return Vector<const char>(

189 reinterpret_cast<const char*>(backing_store_.start()),

190 position_);

191 }

192

193 int length() {

194 return is_ascii_ ? position_ : (position_ >> 1);

195 }

196

197 void Reset() {

198 position_ = 0;

199 is_ascii_ = true;

200 }

201 private:

202 static const int kInitialCapacity = 16;

203 static const int kGrowthFactory = 4;

204 static const int kMinConversionSlack = 256;

205 static const int kMaxGrowth = 1 * MB;

206 inline int NewCapacity(int min_capacity) {

207 int capacity = Max(min_capacity, backing_store_.length());

208 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);

209 return new_capacity;

210 }

211

212 void ExpandBuffer() {

213 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));

214 memcpy(new_store.start(), backing_store_.start(), position_);

215 backing_store_.Dispose();

216 backing_store_ = new_store;

217 }

218

219 void ConvertToUC16() {

220 ASSERT(is_ascii_);

221 Vector<byte> new_store;

222 int new_content_size = position_ * kUC16Size;

223 if (new_content_size >= backing_store_.length()) {

224 // Ensure room for all currently read characters as UC16 as well

225 // as the character about to be stored.

226 new_store = Vector<byte>::New(NewCapacity(new_content_size));

227 } else {

228 new_store = backing_store_;

229 }

230 char* src = reinterpret_cast<char*>(backing_store_.start());

231 uc16* dst = reinterpret_cast<uc16*>(new_store.start());

232 for (int i = position_ - 1; i >= 0; i--) {

233 dst[i] = src[i];

234 }

235 if (new_store.start() != backing_store_.start()) {

236 backing_store_.Dispose();

237 backing_store_ = new_store;

238 }

239 position_ = new_content_size;

240 is_ascii_ = false;

241 }

242

243 bool is_ascii_;

244 int position_;

245 Vector<byte> backing_store_;

246

247 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);

248 };

249

250

251 // ----------------------------------------------------------------------------

252 // Scanner base-class.

253

254 // Generic functionality used by both JSON and JavaScript scanners.

255 class Scanner {

256 public:

257 // -1 is outside of the range of any real source code.

258 static const int kNoOctalLocation = -1;

259

260 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

261

262 class LiteralScope {

263 public:

264 explicit LiteralScope(Scanner* self);

265 ~LiteralScope();

266 void Complete();

267

268 private:

269 Scanner* scanner_;

270 bool complete_;

271 };

272

273 explicit Scanner(UnicodeCache* scanner_contants);

274

275 // Returns the current token again.

276 Token::Value current_token() { return current_.token; }

277

278 // One token look-ahead (past the token returned by Next()).

279 Token::Value peek() const { return next_.token; }

280

281 struct Location {

282 Location(int b, int e) : beg_pos(b), end_pos(e) { }

283 Location() : beg_pos(0), end_pos(0) { }

284

285 bool IsValid() const {

286 return beg_pos >= 0 && end_pos >= beg_pos;

287 }

288

289 static Location invalid() { return Location(-1, -1); }

290

291 int beg_pos;

292 int end_pos;

293 };

294

295 // Returns the location information for the current token

296 // (the token returned by Next()).

297 Location location() const { return current_.location; }

298 Location peek_location() const { return next_.location; }

299

300 // Returns the literal string, if any, for the current token (the

301 // token returned by Next()). The string is 0-terminated and in

302 // UTF-8 format; they may contain 0-characters. Literal strings are

303 // collected for identifiers, strings, and numbers.

304 // These functions only give the correct result if the literal

305 // was scanned between calls to StartLiteral() and TerminateLiteral().

306 bool is_literal_ascii() {

307 ASSERT_NOT_NULL(current_.literal_chars);

308 return current_.literal_chars->is_ascii();

309 }

310 Vector<const char> literal_ascii_string() {

311 ASSERT_NOT_NULL(current_.literal_chars);

312 return current_.literal_chars->ascii_literal();

313 }

314 Vector<const uc16> literal_uc16_string() {

315 ASSERT_NOT_NULL(current_.literal_chars);

316 return current_.literal_chars->uc16_literal();

317 }

318 int literal_length() const {

319 ASSERT_NOT_NULL(current_.literal_chars);

320 return current_.literal_chars->length();

321 }

322

323 bool literal_contains_escapes() const {

324 Location location = current_.location;

325 int source_length = (location.end_pos - location.beg_pos);

326 if (current_.token == Token::STRING) {

327 // Subtract delimiters.

328 source_length -= 2;

329 }

330 return current_.literal_chars->length() != source_length;

331 }

332

333 // Returns the literal string for the next token (the token that

334 // would be returned if Next() were called).

335 bool is_next_literal_ascii() {

336 ASSERT_NOT_NULL(next_.literal_chars);

337 return next_.literal_chars->is_ascii();

338 }

339 Vector<const char> next_literal_ascii_string() {

340 ASSERT_NOT_NULL(next_.literal_chars);

341 return next_.literal_chars->ascii_literal();

342 }

343 Vector<const uc16> next_literal_uc16_string() {

344 ASSERT_NOT_NULL(next_.literal_chars);

345 return next_.literal_chars->uc16_literal();

346 }

347 int next_literal_length() const {

348 ASSERT_NOT_NULL(next_.literal_chars);

349 return next_.literal_chars->length();

350 }

351

352 UnicodeCache* unicode_cache() { return unicode_cache_; }

353

354 static const int kCharacterLookaheadBufferSize = 1;

355

356 protected:

357 // The current and look-ahead token.

358 struct TokenDesc {

359 Token::Value token;

360 Location location;

361 LiteralBuffer* literal_chars;

362 };

363

364 // Call this after setting source_ to the input.

365 void Init() {

366 // Set c0_ (one character ahead)

367 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);

368 Advance();

369 // Initialize current_ to not refer to a literal.

370 current_.literal_chars = NULL;

371 }

372

373 // Literal buffer support

374 inline void StartLiteral() {

375 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?

376 &literal_buffer2_ : &literal_buffer1_;

377 free_buffer->Reset();

378 next_.literal_chars = free_buffer;

379 }

380

381 inline void AddLiteralChar(uc32 c) {

382 ASSERT_NOT_NULL(next_.literal_chars);

383 next_.literal_chars->AddChar(c);

384 }

385

386 // Complete scanning of a literal.

387 inline void TerminateLiteral() {

388 // Does nothing in the current implementation.

389 }

390

391 // Stops scanning of a literal and drop the collected characters,

392 // e.g., due to an encountered error.

393 inline void DropLiteral() {

394 next_.literal_chars = NULL;

395 }

396

397 inline void AddLiteralCharAdvance() {

398 AddLiteralChar(c0_);

399 Advance();

400 }

401

402 // Low-level scanning support.

403 void Advance() { c0_ = source_->Advance(); }

404 void PushBack(uc32 ch) {

405 source_->PushBack(c0_);

406 c0_ = ch;

407 }

408

409 inline Token::Value Select(Token::Value tok) {

410 Advance();

411 return tok;

412 }

413

414 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {

415 Advance();

416 if (c0_ == next) {

417 Advance();

418 return then;

419 } else {

420 return else_;

421 }

422 }

423

424 uc32 ScanHexNumber(int expected_length);

425

426 // Return the current source position.

427 int source_pos() {

428 return source_->pos() - kCharacterLookaheadBufferSize;

429 }

430

431 UnicodeCache* unicode_cache_;

432

433 // Buffers collecting literal strings, numbers, etc.

434 LiteralBuffer literal_buffer1_;

435 LiteralBuffer literal_buffer2_;

436

437 TokenDesc current_; // desc for current token (as returned by Next())

438 TokenDesc next_; // desc for next token (one token look-ahead)

439

440 // Input stream. Must be initialized to an UC16CharacterStream.

441 UC16CharacterStream* source_;

442

443 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

444 uc32 c0_;

445 };

446

447 // ----------------------------------------------------------------------------

448 // JavaScriptScanner - base logic for JavaScript scanning.

449

450 class JavaScriptScanner : public Scanner {

451 public:

452 // A LiteralScope that disables recording of some types of JavaScript

453 // literals. If the scanner is configured to not record the specific

454 // type of literal, the scope will not call StartLiteral.

455 class LiteralScope {

456 public:

457 explicit LiteralScope(JavaScriptScanner* self)

458 : scanner_(self), complete_(false) {

459 scanner_->StartLiteral();

460 }

461 ~LiteralScope() {

462 if (!complete_) scanner_->DropLiteral();

463 }

464 void Complete() {

465 scanner_->TerminateLiteral();

466 complete_ = true;

467 }

468

469 private:

470 JavaScriptScanner* scanner_;

471 bool complete_;

472 };

473

474 explicit JavaScriptScanner(UnicodeCache* scanner_contants);

475

476 void Initialize(UC16CharacterStream* source);

477

478 // Returns the next token.

479 Token::Value Next();

480

481 // Returns true if there was a line terminator before the peek'ed token,

482 // possibly inside a multi-line comment.

483 bool HasAnyLineTerminatorBeforeNext() const {

484 return has_line_terminator_before_next_ \|\|

485 has_multiline_comment_before_next_;

486 }

487

488 // Scans the input as a regular expression pattern, previous

489 // character(s) must be /(=). Returns true if a pattern is scanned.

490 bool ScanRegExpPattern(bool seen_equal);

491 // Returns true if regexp flags are scanned (always since flags can

492 // be empty).

493 bool ScanRegExpFlags();

494

495 // Tells whether the buffer contains an identifier (no escapes).

496 // Used for checking if a property name is an identifier.

497 static bool IsIdentifier(unibrow::CharacterStream* buffer);

498

499 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.

500 uc32 ScanOctalEscape(uc32 c, int length);

501

502 // Returns the location of the last seen octal literal

503 Location octal_position() const { return octal_pos_; }

504 void clear_octal_position() { octal_pos_ = Location::invalid(); }

505

506 // Seek forward to the given position. This operation does not

507 // work in general, for instance when there are pushed back

508 // characters, but works for seeking forward until simple delimiter

509 // tokens, which is what it is used for.

510 void SeekForward(int pos);

511

512 bool HarmonyBlockScoping() const {

513 return harmony_block_scoping_;

514 }

515 void SetHarmonyBlockScoping(bool block_scoping) {

516 harmony_block_scoping_ = block_scoping;

517 }

518

519

520 protected:

521 bool SkipWhiteSpace();

522 Token::Value SkipSingleLineComment();

523 Token::Value SkipMultiLineComment();

524

525 // Scans a single JavaScript token.

526 void Scan();

527

528 void ScanDecimalDigits();

529 Token::Value ScanNumber(bool seen_period);

530 Token::Value ScanIdentifierOrKeyword();

531 Token::Value ScanIdentifierSuffix(LiteralScope* literal);

532

533 void ScanEscape();

534 Token::Value ScanString();

535

536 // Scans a possible HTML comment -- begins with '<!'.

537 Token::Value ScanHtmlComment();

538

539 // Decodes a unicode escape-sequence which is part of an identifier.

540 // If the escape sequence cannot be decoded the result is kBadChar.

541 uc32 ScanIdentifierUnicodeEscape();

542 // Recognizes a uniocde escape-sequence and adds its characters,

543 // uninterpreted, to the current literal. Used for parsing RegExp

544 // flags.

545 bool ScanLiteralUnicodeEscape();

546

547 // Start position of the octal literal last scanned.

548 Location octal_pos_;

549

550 // Whether there is a line terminator whitespace character after

551 // the current token, and before the next. Does not count newlines

552 // inside multiline comments.

553 bool has_line_terminator_before_next_;

554 // Whether there is a multi-line comment that contains a

555 // line-terminator after the current token, and before the next.

556 bool has_multiline_comment_before_next_;

557 // Whether we scan 'let' as a keyword for harmony block scoped

558 // let bindings.

559 bool harmony_block_scoping_;

560 };

561

562 } } // namespace v8::internal

563

564 #endif // V8_SCANNER_BASE_H_

OLD	NEW

« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »