src/parsing/scanner.h - Issue 2314663002: Rework scanner-character-streams.

Side by Side Diff: src/parsing/scanner.h

Issue 2314663002: Rework scanner-character-streams. (Closed)

Patch Set: Some fixes, and marching down the very long road to make all compilers happy. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Features shared by parsing and pre-parsing scanners.	5 // Features shared by parsing and pre-parsing scanners.

6	6

7 #ifndef V8_PARSING_SCANNER_H_	7 #ifndef V8_PARSING_SCANNER_H_

8 #define V8_PARSING_SCANNER_H_	8 #define V8_PARSING_SCANNER_H_

9	9

10 #include "src/allocation.h"	10 #include "src/allocation.h"

11 #include "src/base/logging.h"	11 #include "src/base/logging.h"

12 #include "src/char-predicates.h"	12 #include "src/char-predicates.h"

13 #include "src/globals.h"	13 #include "src/globals.h"

14 #include "src/messages.h"	14 #include "src/messages.h"

15 #include "src/parsing/token.h"	15 #include "src/parsing/token.h"

16 #include "src/unicode-decoder.h"	16 #include "src/unicode-decoder.h"

17 #include "src/unicode.h"	17 #include "src/unicode.h"

18	18

19 namespace v8 {	19 namespace v8 {

20 namespace internal {	20 namespace internal {

21	21

22	22

23 class AstRawString;	23 class AstRawString;

24 class AstValueFactory;	24 class AstValueFactory;

25 class DuplicateFinder;	25 class DuplicateFinder;

	26 class ExternalOneByteString;

	27 class ExternalTwoByteString;

26 class ParserRecorder;	28 class ParserRecorder;

27 class UnicodeCache;	29 class UnicodeCache;

28	30

29

30 // ---------------------------------------------------------------------	31 // ---------------------------------------------------------------------

31 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.	32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.

32 // A code unit is a 16 bit value representing either a 16 bit code point	33 // A code unit is a 16 bit value representing either a 16 bit code point

33 // or one part of a surrogate pair that make a single 21 bit code point.	34 // or one part of a surrogate pair that make a single 21 bit code point.

34

35 class Utf16CharacterStream {	35 class Utf16CharacterStream {

36 public:	36 public:

37 Utf16CharacterStream() : pos_(0) { }	37 static const uc32 kEndOfInput = -1;

	38

38 virtual ~Utf16CharacterStream() { }	39 virtual ~Utf16CharacterStream() { }

39	40

40 // Returns and advances past the next UTF-16 code unit in the input	41 // Returns and advances past the next UTF-16 code unit in the input

41 // stream. If there are no more code units, it returns a negative	42 // stream. If there are no more code units it returns kEndOfInput.

42 // value.

43 inline uc32 Advance() {	43 inline uc32 Advance() {

44 if (buffer_cursor_ < buffer_end_ \|\| ReadBlock()) {	44 if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {

45 pos_++;

46 return static_cast<uc32>(*(buffer_cursor_++));	45 return static_cast<uc32>(*(buffer_cursor_++));

	46 } else if (ReadBlock()) {
	nickie 2016/09/07 13:28:28 I suppose you did not like how it was before: if (V8_LIKELY(...) || ReadBlock()) vogelheim 2016/09/08 13:09:02 On 2016/09/07 13:28:28, nickie wrote: > I suppose you did not like how it was before: > if (V8_LIKELY(...) || ReadBlock()) I liked it, but: V8_LIKELY supplies information about the if-else to the compiler. I'm not sure the compiler can transfer this explicit likelihood information on a sub-expression to the branches it might generate from the surrounding expression. Admittedly, I intuited that, and I can't find documentation that is explicit about this. I've never seen a V8_(UN)LIKELY or __builtin_expect that did not cover the entire condition, though.
	47 return static_cast<uc32>(*(buffer_cursor_++));

	48 } else {

	49 // Note: currently the following increment is necessary to avoid a

	50 // parser problem! The scanner treats the final kEndOfInput as

	51 // a code unit with a position, and does math relative to that

	52 // position.

	53 buffer_cursor_++;

	54 return kEndOfInput;

47 }	55 }

48 // Note: currently the following increment is necessary to avoid a

49 // parser problem! The scanner treats the final kEndOfInput as

50 // a code unit with a position, and does math relative to that

51 // position.

52 pos_++;

53

54 return kEndOfInput;

55 }	56 }

56	57

57 // Return the current position in the code unit stream.	58 inline void Back() {
	nickie 2016/09/07 13:28:28 A comment explaining what this should do would be welcome. It's straightforward in the most likely case, not in the other one. vogelheim 2016/09/08 13:09:03 On 2016/09/07 13:28:28, nickie wrote: > A comment explaining what this should do would be welcome. > It's straightforward in the most likely case, not in the other one. Done.
58 // Starts at zero.	59 if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {

59 inline size_t pos() const { return pos_; }	60 buffer_cursor_--;
	marja 2016/09/07 09:17:57 Nit: --buffer_cursor_; vogelheim 2016/09/08 13:09:02 On 2016/09/07 09:17:57, marja wrote: > Nit: --buffer_cursor_; Why? [Here and elsewhere.] ------ I find the prefix version odd to read. The style guide has no opinion: V8 style references Chromium style, which references Google style, which says: "For simple scalar (non-object) values there is no reason to prefer one form and we allow either." [Ref: https://google.github.io/styleguide/cppguide.html#Preincrement_and_Predecrement ]
60	61 } else {

61 // Skips forward past the next code_unit_count UTF-16 code units	62 ReadBlockAt(pos() - 1);
	nickie 2016/09/07 13:28:28 We discussed this offline. This can make a series of consecutive Back() calls pretty expensive. It could be "void Back(int num = 1)" and use "pos() - num" here. vogelheim 2016/09/08 13:09:02 On 2016/09/07 13:28:28, nickie wrote: > We discussed this offline. This can make a series of consecutive Back() calls pretty > expensive. It could be "void Back(int num = 1)" and use "pos() - num" here. That's almost the same as Seek. :) The Scanner only uses num = 1..2, so I added a Back2().
62 // in the input, or until the end of input if that comes sooner.

63 // Returns the number of code units actually skipped. If less

64 // than code_unit_count,

65 inline size_t SeekForward(size_t code_unit_count) {

66 size_t buffered_chars = buffer_end_ - buffer_cursor_;

67 if (code_unit_count <= buffered_chars) {

68 buffer_cursor_ += code_unit_count;

69 pos_ += code_unit_count;

70 return code_unit_count;

71 }	63 }

72 return SlowSeekForward(code_unit_count);

73 }	64 }

74	65

75 // Pushes back the most recently read UTF-16 code unit (or negative	66 inline size_t pos() const {

76 // value if at end of input), i.e., the value returned by the most recent	67 return buffer_pos_ + (buffer_cursor_ - buffer_start_);
	nickie 2016/09/07 13:28:28 This calculation is quite expensive and the pos() method is called very often from the parser. I suppose the scanner delegates it here. Maybe it would make sense to measure this against having a separate current_pos_ field (like the pos_ in old line 59) and making sure that this is consistent. vogelheim 2016/09/08 13:09:02 On 2016/09/07 13:28:28, nickie wrote: > This calculation is quite expensive and the pos() method is called very often > from the parser. I suppose the scanner delegates it here. > > Maybe it would make sense to measure this against having a separate current_pos_ > field (like the pos_ in old line 59) and making sure that this is consistent. Hmm. I think I should try it both ways. (My thinking was: pos() is fully inline-able, and should be only 2 instructions on x64, on data that is already in cache (and oftentimes also in a register) at that point. The alternative, an additional variable plus 1 instruction to maintain it, might not be faster.)
77 // call to Advance.	68 }

78 // Must not be used right after calling SeekForward.

79 virtual void PushBack(int32_t code_unit) = 0;

80	69

81 virtual bool SetBookmark();	70 inline void Seek(size_t pos) {

82 virtual void ResetToBookmark();	71 if (V8_LIKELY(pos >= buffer_pos_ &&

	72 pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {

	73 buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);

	74 } else {

	75 ReadBlockAt(pos);

	76 }

	77 }

	78

	79 // Legacy API:

	80 void SeekForward(size_t code_unit_count) { Seek(pos() + code_unit_count); }

	81 void PushBack(int32_t code_unit) {

	82 Back();

	83 #ifdef DEBUG

	84 uc32 t = Advance();

	85 DCHECK_EQ(t, code_unit);

	86 Back();
	nickie 2016/09/07 13:28:28 Why not simply this? DCHECK_EQ(code_unit, static_cast<uc32>(*(buffer_cursor_++))); After Back() it is guaranteed that buffer_cursor_ < buffer_end_, isn't it? In a rather infrequent case, this Advance-Back will result in an extra buffer load, right?
	87 #endif // DEBUG

	88 }

	89 bool SetBookmark() {
	nickie 2016/09/07 13:28:28 I don't understand why the bookmark should be stored in the stream. A simpler interface would be to use "bookmark = pos()" directly to obtain a bookmark, and something like "Seek(bookmark)" to reset to that. If we still want to have streams that do not support bookmarking (?) then we could have a SetBookmark method returning either pos() or NO_BOOKMARK (see below). vogelheim 2016/09/08 13:09:03 On 2016/09/07 13:28:28, nickie wrote: > I don't understand why the bookmark should be stored in the stream. A simpler > interface would be to use "bookmark = pos()" directly to obtain a bookmark, and > something like "Seek(bookmark)" to reset to that. If we still want to have > streams that do not support bookmarking (?) then we could have a SetBookmark > method returning either pos() or NO_BOOKMARK (see below). You're exactly right: The bookmark doesn't belong in the stream (any more). But I was desperately trying to keep the CL small-ish, and hence wanted to avoid touching the clients of the stream just yet, and to remove the "// Legacy API" in a separate, upcoming CL. That would be exactly what you suggest.
	90 bookmark_ = pos();

	91 return true;

	92 }

	93 void ResetToBookmark() {

	94 DCHECK(bookmark_ != (size_t)-1);
	nickie 2016/09/07 13:28:28 How about something like this? (with a better name, I guess) static const size_t NO_BOOKMARK = numeric_limits<size_t>::max(); vogelheim 2016/09/08 13:09:03 On 2016/09/07 13:28:28, nickie wrote: > How about something like this? (with a better name, I guess) > static const size_t NO_BOOKMARK = numeric_limits<size_t>::max(); Done.
	95 Seek(bookmark_);

	96 }

83	97

84 protected:	98 protected:

85 static const uc32 kEndOfInput = -1;	99 Utf16CharacterStream(const uint16_t* buffer_start,

	100 const uint16_t* buffer_cursor,

	101 const uint16_t* buffer_end, size_t buffer_pos)

	102 : buffer_start_(buffer_start),

	103 buffer_cursor_(buffer_cursor),

	104 buffer_end_(buffer_end),

	105 buffer_pos_(buffer_pos),

	106 bookmark_((size_t)-1) {}
	nickie 2016/09/07 13:28:28 Again, NO_BOOKMARK here. vogelheim 2016/09/08 13:09:03 On 2016/09/07 13:28:28, nickie wrote: > Again, NO_BOOKMARK here. Done.
	107 Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}

	108

	109 void ReadBlockAt(size_t new_pos) {

	110 // This shouldn't ever be called if new_pos is inside the current buffer.

	111 DCHECK(new_pos < buffer_pos_ \|\|

	112 new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));

	113 buffer_pos_ = new_pos;

	114 buffer_cursor_ = buffer_start_;
	nickie 2016/09/07 13:28:28 I don't understand this. Maybe related to my next comment. vogelheim 2016/09/08 13:09:02 On 2016/09/07 13:28:28, nickie wrote: > I don't understand this. Maybe related to my next comment. See below. This wants to set the position for ReadBlock() (which reads it from pos()), so it must change all the vars that make up pos() to accomplish this.
	115 ReadBlock();

	116 }

86	117

87 // Ensures that the buffer_cursor_ points to the code_unit at	118 // Ensures that the buffer_cursor_ points to the code_unit at

88 // position pos_ of the input, if possible. If the position	119 // position pos() of the input. Returns true if data is available; false if
	nickie 2016/09/07 13:28:29 How can this not be true? pos() is calculated in such a way as to enforce that. I suppose this comment should change and describe more clearly what ReadBlock is expected to do. vogelheim 2016/09/08 13:09:02 On 2016/09/07 13:28:29, nickie wrote: > How can this not be true? > pos() is calculated in such a way as to enforce that. > I suppose this comment should change and describe more clearly what ReadBlock is > expected to do. It's meant to explain the post-condition for ReadBlock, and in particular remind the implementer that they can/must update both buffer_pos_ and buffer_start_. I think the source of confusion is that pos() is really a derivative of three values, so any method that updates any of those needs to take care to consider them jointly. I updated the comments to reflect this. ------ Maybe the best way of fixing this is to drop the ReadBlock() signature, and always use something like FillBuffer(size_t). If so, I'll do it in a separate step.
89 // is at or after the end of the input, return false. If there	120 // pos() is at (or after) the end of input

90 // are more code_units available, return true.

91 virtual bool ReadBlock() = 0;	121 virtual bool ReadBlock() = 0;

92 virtual size_t SlowSeekForward(size_t code_unit_count) = 0;

93	122

	123 const uint16_t* buffer_start_;

94 const uint16_t* buffer_cursor_;	124 const uint16_t* buffer_cursor_;

95 const uint16_t* buffer_end_;	125 const uint16_t* buffer_end_;

96 size_t pos_;	126 size_t buffer_pos_;

	127 size_t bookmark_;

97 };	128 };

98	129

99	130

100 // ----------------------------------------------------------------------------	131 // ----------------------------------------------------------------------------

101 // JavaScript Scanner.	132 // JavaScript Scanner.

102	133

103 class Scanner {	134 class Scanner {

104 public:	135 public:

105 // Scoped helper for a re-settable bookmark.	136 // Scoped helper for a re-settable bookmark.

106 class BookmarkScope {	137 class BookmarkScope {

(...skipping 24 matching lines...) Expand all Loading...
131 }	162 }

132	163

133 static Location invalid() { return Location(-1, -1); }	164 static Location invalid() { return Location(-1, -1); }

134	165

135 int beg_pos;	166 int beg_pos;

136 int end_pos;	167 int end_pos;

137 };	168 };

138	169

139 // -1 is outside of the range of any real source code.	170 // -1 is outside of the range of any real source code.

140 static const int kNoOctalLocation = -1;	171 static const int kNoOctalLocation = -1;

	172 static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;

141	173

142 explicit Scanner(UnicodeCache* scanner_contants);	174 explicit Scanner(UnicodeCache* scanner_contants);

143	175

144 void Initialize(Utf16CharacterStream* source);	176 void Initialize(Utf16CharacterStream* source);

145	177

146 // Returns the next token and advances input.	178 // Returns the next token and advances input.

147 Token::Value Next();	179 Token::Value Next();

148 // Returns the token following peek()	180 // Returns the token following peek()

149 Token::Value PeekAhead();	181 Token::Value PeekAhead();

150 // Returns the current token again.	182 // Returns the current token again.

(...skipping 641 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
792 bool found_html_comment_;	824 bool found_html_comment_;

793	825

794 MessageTemplate::Template scanner_error_;	826 MessageTemplate::Template scanner_error_;

795 Location scanner_error_location_;	827 Location scanner_error_location_;

796 };	828 };

797	829

798 } // namespace internal	830 } // namespace internal

799 } // namespace v8	831 } // namespace v8

800	832

801 #endif // V8_PARSING_SCANNER_H_	833 #endif // V8_PARSING_SCANNER_H_

OLD	NEW

« no previous file with comments | « src/parsing/parser.cc ('k') | src/parsing/scanner.cc » ('j') | src/parsing/scanner-character-streams.cc » ('J')