src/parsing/scanner.h - Issue 2314663002: Rework scanner-character-streams.

Unified Diff: src/parsing/scanner.h

Issue 2314663002: Rework scanner-character-streams. (Closed)

Patch Set: Marja's feedback. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/parsing/scanner.h

diff --git a/src/parsing/scanner.h b/src/parsing/scanner.h

index cb37f7cc1a5dd7025579d03902a789c6c1e530bb..7d8f052558ae6e64adc6633012cf9895636bd7eb 100644

--- a/src/parsing/scanner.h

+++ b/src/parsing/scanner.h

@@ -23,77 +23,157 @@ namespace internal {

class AstRawString;

class AstValueFactory;

class DuplicateFinder;

+class ExternalOneByteString;

+class ExternalTwoByteString;

class ParserRecorder;

class UnicodeCache;

// ---------------------------------------------------------------------

// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.

// A code unit is a 16 bit value representing either a 16 bit code point

// or one part of a surrogate pair that make a single 21 bit code point.

class Utf16CharacterStream {

public:

- Utf16CharacterStream() : pos_(0) { }

+ static const uc32 kEndOfInput = -1;

virtual ~Utf16CharacterStream() { }

// Returns and advances past the next UTF-16 code unit in the input

- // stream. If there are no more code units, it returns a negative

- // value.

+ // stream. If there are no more code units it returns kEndOfInput.

inline uc32 Advance() {

- if (buffer_cursor_ < buffer_end_ || ReadBlock()) {

- pos_++;

+ if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {

return static_cast<uc32>(*(buffer_cursor_++));

+ } else if (ReadBlock()) {

+ return static_cast<uc32>(*(buffer_cursor_++));

+ } else {

+ // Note: currently the following increment is necessary to avoid a

+ // parser problem! The scanner treats the final kEndOfInput as

+ // a code unit with a position, and does math relative to that

+ // position.

+ buffer_cursor_++;

+ return kEndOfInput;

}

- // Note: currently the following increment is necessary to avoid a

- // parser problem! The scanner treats the final kEndOfInput as

- // a code unit with a position, and does math relative to that

- // position.

- pos_++;

- return kEndOfInput;

- }

- // Return the current position in the code unit stream.

- // Starts at zero.

- inline size_t pos() const { return pos_; }

- // Skips forward past the next code_unit_count UTF-16 code units

- // in the input, or until the end of input if that comes sooner.

- // Returns the number of code units actually skipped. If less

- // than code_unit_count,

- inline size_t SeekForward(size_t code_unit_count) {

- size_t buffered_chars = buffer_end_ - buffer_cursor_;

- if (code_unit_count <= buffered_chars) {

- buffer_cursor_ += code_unit_count;

- pos_ += code_unit_count;

- return code_unit_count;

+ }

+ // Go back by one character in the input stream.

+ // This undoes the most recent Advance().

+ inline void Back() {

+ // The common case - if the previous character is within

+ // buffer_start_ .. buffer_end_ will be handled locally.

+ // Otherwise, a new block is requested.

+ if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {

+ buffer_cursor_--;

+ } else {

+ ReadBlockAt(pos() - 1);

}

- return SlowSeekForward(code_unit_count);

}

- // Pushes back the most recently read UTF-16 code unit (or negative

- // value if at end of input), i.e., the value returned by the most recent

- // call to Advance.

- // Must not be used right after calling SeekForward.

- virtual void PushBack(int32_t code_unit) = 0;

+ // Go back by two characters in the input stream. (This is the same as

+ // calling Back() twice. But Back() may - in some instances - do substantial

+ // work. Back2() guarantees this work will be done only once.)

+ inline void Back2() {

+ if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) {

+ buffer_cursor_ -= 2;

+ } else {

+ ReadBlockAt(pos() - 2);

+ }

- virtual bool SetBookmark();

- virtual void ResetToBookmark();

+ inline size_t pos() const {

+ return buffer_pos_ + (buffer_cursor_ - buffer_start_);

+ }

- protected:

- static const uc32 kEndOfInput = -1;

+ inline void Seek(size_t pos) {

+ if (V8_LIKELY(pos >= buffer_pos_ &&

+ pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {

+ buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);

+ } else {

+ ReadBlockAt(pos);

+ }

+ // Legacy API:

+ void SeekForward(size_t code_unit_count) { Seek(pos() + code_unit_count); }

+ void PushBack(int32_t code_unit) {

+ Back();

+#ifdef DEBUG

+ uc32 t = Advance();

+ DCHECK_EQ(t, code_unit);

+ Back();

+#endif // DEBUG

+ }

+ void PushBack2(int32_t code_unit_back_1, int32_t code_unit_back_2) {

+ Back2();

+#ifdef DEBUG

+ DCHECK_EQ(Advance(), code_unit_back_2);

+ DCHECK_EQ(Advance(), code_unit_back_1);

+ Back2();

+#endif // DEBUG

+ }

+ bool SetBookmark() {

+ bookmark_ = pos();

+ return true;

+ }

+ void ResetToBookmark() {

+ DCHECK_NE(bookmark_, kNoBookmark);

+ Seek(bookmark_);

+ }

- // Ensures that the buffer_cursor_ points to the code_unit at

- // position pos_ of the input, if possible. If the position

- // is at or after the end of the input, return false. If there

- // are more code_units available, return true.

+ protected:

+ static const size_t kNoBookmark;

+ Utf16CharacterStream(const uint16_t* buffer_start,

+ const uint16_t* buffer_cursor,

+ const uint16_t* buffer_end, size_t buffer_pos)

+ : buffer_start_(buffer_start),

+ buffer_cursor_(buffer_cursor),

+ buffer_end_(buffer_end),

+ buffer_pos_(buffer_pos),

+ bookmark_(kNoBookmark) {}

+ Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}

+ void ReadBlockAt(size_t new_pos) {

+ // The callers of this method (Back/Back2/Seek) should handle the easy

+ // case (seeking within the current buffer), and we should only get here

+ // if we actually require new data.

+ // (This is really an efficiency check, not a correctness invariant.)

+ DCHECK(new_pos < buffer_pos_ ||

+ new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));

+ // Change pos() to point to new_pos.

+ buffer_pos_ = new_pos;

+ buffer_cursor_ = buffer_start_;

+ bool success = ReadBlock();

+ USE(success);

+ // Post-conditions: 1, on success, we should be at the right position.

+ // 2, success == we should have more characters available.

+ DCHECK_IMPLIES(success, pos() == new_pos);

+ DCHECK_EQ(success, buffer_cursor_ < buffer_end_);

+ DCHECK_EQ(success, buffer_start_ < buffer_end_);

+ }

+ // Read more data, and update buffer_*_ to point to it.

+ // Returns true if more data was available.

+ //

+ // ReadBlock() may modify any of the buffer_*_ members, but must ensure that

+ // the result of pos() remains unaffected.

+ //

+ // Examples:

+ // - a stream could fill a separate buffer. Then buffer_start_ and

+ // buffer_cursor_ would point to the beginning of the buffer, and

+ // buffer_pos_ would be the old pos().

+ // - a stream with existing buffer chunks would set buffer_start_ and

+ // buffer_end_ to cover the full chunk, and then buffer_cursor_ would

+ // point into the middle of the buffer, while buffer_pos_ would describe

+ // the start of the buffer.

virtual bool ReadBlock() = 0;

- virtual size_t SlowSeekForward(size_t code_unit_count) = 0;

+ const uint16_t* buffer_start_;

const uint16_t* buffer_cursor_;

const uint16_t* buffer_end_;

- size_t pos_;

+ size_t buffer_pos_;

+ size_t bookmark_;

};

@@ -138,6 +218,7 @@ class Scanner {

// -1 is outside of the range of any real source code.

static const int kNoOctalLocation = -1;

+ static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;

explicit Scanner(UnicodeCache* scanner_contants);

« no previous file with comments | « src/parsing/parser.cc ('k') | src/parsing/scanner.cc » ('j') | no next file with comments »